import json


def load_data(file):
    """Load and return JSON data from *file* (read as UTF-8)."""
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_data(file, data):
    """Serialize *data* to *file* as pretty-printed (indent=4) UTF-8 JSON.

    Note: json.dump returns None, so the original `data = json.dump(...)`
    assignment did nothing useful and has been dropped.
    """
    with open(file, "w", encoding='utf-8') as f:
        json.dump(data, f, indent=4)
Code
# 1 NER spaCy: create training data (pattern list for an EntityRuler)


def _collect_patterns(entries, name_key, label):
    """Return EntityRuler patterns for one section of the master file.

    Emits one pattern per *name_key* value in *entries*, then one pattern for
    each row's "Primary County" that does not also appear as a *name_key*
    value (so county names that double as place names are not re-added).
    Duplicate county patterns are kept, matching the original behavior.
    """
    patterns = []
    for obj in entries:  # place-name loop
        patterns.append({"label": label, "pattern": obj.get(name_key)})
    for obj in entries:  # primary-county loop
        if not any(d.get(name_key) == obj.get("Primary County") for d in entries):
            patterns.append({"label": label, "pattern": obj.get("Primary County")})
    return patterns


def create_training_data(file, type):
    """Build spaCy EntityRuler patterns from the master JSON *file*.

    *type* is the entity label applied to every pattern (e.g. 'GPE').
    NOTE(review): the parameter name shadows the builtin `type`; kept for
    backward compatibility with existing callers.

    The three copy-pasted section loops of the original are folded into one
    helper driven by a section-name -> place-name-key map.
    """
    data = load_data(file)
    section_keys = {
        "List of Municipalities": "Municipality",
        "List of Unincorporated Communities": "Unincorporated Community",
        "List of Ghost Town": "Ghost town",
    }
    patterns = []
    for key, value in data.items():
        name_key = section_keys.get(key)
        if name_key is not None:
            patterns.extend(_collect_patterns(value, name_key, type))
    return patterns


patterns = create_training_data('../Corridos/data/TexasNER_GPE_master.json', 'GPE')
Code
def generate_rules(patterns):
    """Build a blank English pipeline with an EntityRuler for *patterns*,
    save it to ./tx_trained_ner, and return the pipeline.

    Bug fix: the original first built `EntityRuler(nlp)` and immediately
    threw it away — `nlp.add_pipe('entity_ruler', ...)` is what actually
    creates and registers the component, and it returns the ruler.
    """
    nlp = English()
    ruler = nlp.add_pipe('entity_ruler', config={"validate": True})
    ruler.add_patterns(patterns)
    nlp.to_disk("tx_trained_ner")
    return nlp


def test_model(model, text):
    """Run *model* over *text* and collect entity offsets.

    Returns [text, {"entities": [(start_char, end_char, label), ...]}] when
    at least one entity is found, otherwise an empty list.

    Bug fix: the original called the global `nlp` instead of the *model*
    parameter (and the `nlp = spacy.load(...)` line was commented out, so
    the global did not even exist in this cell).
    """
    doc = model(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if entities:
        return [text, {"entities": entities}]
    return []


# generate_rules now returns the pipeline, so the script no longer relies on
# an undefined global `nlp` (the spacy.load fallback stayed commented out).
nlp = generate_rules(patterns)

# How spaCy wants to read the data:
# TRAIN_DATA = [(text, {"entities": [(start, end, label)]})]
TRAIN_DATA = []
with open("elcorridodegregoriocortez.txt", "r", encoding='utf-8') as f:
    text = f.read()

segments = text.split("\n\n")

# Clean up the lyric text so it is easier for the ruler to match: flatten
# each stanza to one line and strip punctuation in a single C-level pass
# (str.translate) instead of one .replace() call per character.
punc = '[":;,“.”[@_!$%^&*()<>?/\|}{~:]#]'
strip_punc = str.maketrans("", "", punc)
for segment in segments:
    segment = segment.strip().replace("\n", " ")
    segment = segment.translate(strip_punc)
    results = test_model(nlp, segment)
    if results != []:  # it found something — keep it
        TRAIN_DATA.append(results)
    print(results)

save_data("data/TexasNER_GPE_trained.json", TRAIN_DATA)
['In the country of Karnes Look what has happened The Major Sheriff died Leaving Román badly wounded', {'entities': [(18, 24, 'GPE')]}]
[]
[]
[]
[]
[]
[]
[]
[]
['He struck out for Gonzales Without showing any fear Follow me cowardly rangers I am Gregorio Cortez', {'entities': [(18, 26, 'GPE')]}]
['From Belmont he went to the ranch They succeeded in surrounding him Quite a few more than three hundred But there he jumped their corral', {'entities': [(5, 12, 'GPE')]}]
[]
[]
['Gregorio Cortez went out He went towards Laredo They decided not to follow Because they were afraid of him', {'entities': [(42, 48, 'GPE')]}]
[]
[]
['Over by El Encinal According to what we hear They made him a corral And he killed them another sheriff', {'entities': [(11, 18, 'GPE')]}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
Code
def train_spacy(data, iterations):
    """Train a blank-English NER model on *data* for *iterations* epochs.

    *data* is [(text, {"entities": [(start, end, label), ...]}), ...].
    30 iterations is a good default — expect a long run time.
    Returns the trained pipeline.

    Bug fixes vs. the original:
    - `nlp.create_pipe("ner")` built an orphan component while
      `nlp.add_pipe("ner")` registered a *different* one, so the labels were
      added to a component that was never trained. In spaCy v3, add_pipe
      creates and returns the registered component.
    - When "ner" was already in the pipeline, `ner` was unbound and the
      label loop raised NameError; `get_pipe` now covers that branch.
    """
    TRAIN_DATA = data
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label seen in the annotations (only 'GPE' here; ent is
    # (start, end, label) so ent[2] is the label).
    for _, annotations in TRAIN_DATA:
        print(annotations.get("entities"))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Disable everything except "ner" so training doesn't disturb other pipes.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            # Shuffle so the model learns patterns, not example order.
            # NOTE(review): this shuffles the caller's list in place.
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                doc = nlp.make_doc(text)
                # Example.from_dict avoids ValueError [E973] on spaCy v3
                # (see https://github.com/explosion/spaCy/issues/7038)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
    return nlp


TRAIN_DATA = load_data("data/TexasNER_GPE_trained.json")
nlp = train_spacy(TRAIN_DATA, 30)
nlp.to_disk("tx_ner_model")