Build a Named Entity Recognition Model with spaCy


March 17, 2023

import spacy
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.pipeline import EntityRuler
import json
import random
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON value (a dict or list, depending on the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)
def save_data(file, data):
    """Write *data* to *file* as indented, UTF-8 encoded JSON.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serialisable object.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If *data* contains values JSON cannot encode.
    """
    with open(file, "w", encoding='utf-8') as f:
        # json.dump returns None, so its result is not worth keeping.
        # ensure_ascii=False keeps accented characters readable in the
        # file; the loaded content is the same either way.
        json.dump(data, f, indent=4, ensure_ascii=False)
# Don't mess with the code block box
# 1 NER Spacy create training data

def create_training_data(file, type):
    """Build entity-ruler patterns from the place-name JSON in *file*.

    The JSON is expected to be a dict whose recognised sections are
    "List of Municipalities", "List of Unincorporated Communities" and
    "List of Ghost Town"; any other key is ignored. Each section is a
    list of objects holding the place name under a section-specific
    field plus a "Primary County" field.

    For every recognised section, one pattern is produced per place
    name, followed by one pattern per "Primary County" value that is not
    itself a place name in the same section.

    Args:
        file: Path of the JSON file, read through ``load_data``.
        type: Entity label stored in every pattern (e.g. ``'GPE'``).
            The name shadows the builtin but is kept so existing
            callers keep working.

    Returns:
        A list of ``{"label": type, "pattern": name}`` dicts, in the
        order the sections appear in the file.
    """
    # Section key -> field that holds the place name within that section.
    name_fields = {
        "List of Municipalities": "Municipality",
        "List of Unincorporated Communities": "Unincorporated Community",
        "List of Ghost Town": "Ghost town",
    }

    data = load_data(file)
    patterns = []
    for key, value in data.items():
        name_field = name_fields.get(key)
        if name_field is None:
            continue

        names = [obj.get(name_field) for obj in value]
        # Entries without a name are skipped: a None pattern is not a
        # usable rule.
        patterns.extend(
            {"label": type, "pattern": name} for name in names if name
        )

        # A set lookup replaces a full rescan of the section per county.
        known_names = set(names)
        for obj in value:
            county = obj.get("Primary County")
            if county and county not in known_names:
                patterns.append({"label": type, "pattern": county})
    return patterns

# Collect the patterns from the master place-name JSON, passing 'GPE' as the type.
patterns = create_training_data('../Corridos/data/TexasNER_GPE_master.json', 'GPE')
def generate_rules(patterns, output_dir="tx_trained_ner"):
    """Create a blank English pipeline whose entity ruler uses *patterns*.

    Args:
        patterns: List of ``{"label": ..., "pattern": ...}`` dicts to
            register with the entity ruler.
        output_dir: Directory the pipeline is saved to. Pass ``None`` to
            skip saving.

    Returns:
        The pipeline containing the populated entity ruler.
    """
    nlp = English()
    # add_pipe creates and returns the component, so a separately
    # constructed EntityRuler is not needed.
    ruler = nlp.add_pipe('entity_ruler', config={"validate": True})
    ruler.add_patterns(patterns)
    if output_dir is not None:
        nlp.to_disk(output_dir)
    return nlp

def test_model(model, text):
    """Run *model* on *text* and return its entities as training data.

    Args:
        model: Callable that takes a string and returns an object with
            an ``ents`` iterable; each entity must expose
            ``start_char``, ``end_char`` and ``label_``.
        text: The text to analyse.

    Returns:
        ``[text, {"entities": [(start, end, label), ...]}]`` when at
        least one entity was found, otherwise an empty list.
    """
    # Use the model that was passed in, not a module-level global.
    doc = model(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    if not entities:
        return []
    return [text, {"entities": entities}]


# The name starts with "test", so tell pytest this is a helper and not a
# test case to collect.
test_model.__test__ = False


# How spaCy expects the training data to be formatted:
#TRAIN_DATA = [(text, {"entities": [(start, end, label)]})]#

# Load the saved rule-based pipeline and use it to label the lyrics.
nlp = spacy.load("tx_trained_ner")

# Characters removed from every segment before it is passed to the model.
# The backslash is escaped explicitly; the character set is unchanged.
punc = '[":;,“.”[@_!$%^&*()<>?/\\|}{~:]#]'
strip_punc = str.maketrans("", "", punc)

# Collected as [text, {"entities": [(start, end, label), ...]}] items.
TRAIN_DATA = []

with open("elcorridodegregoriocortez.txt", "r", encoding='utf-8') as f:
    text = f.read()

# Segments are separated by blank lines in the lyrics file.
segments = text.split("\n\n")
for segment in segments:
    # Flatten each segment to one line and drop punctuation in one pass.
    segment = segment.strip().replace("\n", " ").translate(strip_punc)

    results = test_model(nlp, segment)
    if results != []:  # at least one entity was found in this segment
        TRAIN_DATA.append(results)

save_data("data/TexasNER_GPE_trained.json", TRAIN_DATA)
['In the country of Karnes  Look what has happened  The Major Sheriff died Leaving Román badly wounded', {'entities': [(18, 24, 'GPE')]}]
['He struck out for Gonzales Without showing any fear Follow me cowardly rangers  I am Gregorio Cortez', {'entities': [(18, 26, 'GPE')]}]
['From Belmont he went to the  ranch   They succeeded in surrounding  him  Quite a few more than three hundred  But there he jumped their corral', {'entities': [(5, 12, 'GPE')]}]
['Gregorio Cortez went out  He went towards Laredo They decided not to follow  Because they were afraid of him', {'entities': [(42, 48, 'GPE')]}]
['Over by El Encinal  According to what we hear  They made him a corral  And he killed them another sheriff', {'entities': [(11, 18, 'GPE')]}]
def train_spacy(data, iterations):
    """Train a blank English NER model on *data*.

    Args:
        data: Sequence of ``(text, {"entities": [(start, end, label), ...]})``
            items. The sequence itself is left untouched.
        iterations: Number of passes over the training data.

    Returns:
        The trained pipeline.
    """
    # Work on a copy so the per-iteration shuffle does not reorder the
    # caller's list.
    train_data = list(data)

    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        # add_pipe returns the component that is actually in the
        # pipeline; labels must be added to that one.
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in train_data:  # register every label in the data
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # Only the NER component is updated; anything else stays disabled
    # while training.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            # Shuffling each pass keeps the model from learning the
            # order of the examples.
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                # The update API takes Example objects, not raw
                # (text, annotations) tuples (see spaCy issue #7038).
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
    return nlp

# Train on the saved training data and write the model to disk under the
# name that the later spacy.load("tx_ner_model") call expects.
TRAIN_DATA = load_data("data/TexasNER_GPE_trained.json")
nlp = train_spacy(TRAIN_DATA, 30)
nlp.to_disk("tx_ner_model")

# Read the lyrics used to try out the trained model; the path gets its
# own name so `test` only ever holds the text.
test_path = "elcorridodegregoriocortez.txt"
with open(test_path, 'r', encoding='utf-8') as c:
    test = c.read()

import re

def clean_text(text):
    """Return *text* with basic punctuation removed.

    The characters removed are the straight and curly double quotes,
    colon, semicolon, comma and full stop.

    Args:
        text: The string to clean.

    Returns:
        A new string without those characters.
    """
    return re.sub(r'[":;,.“”]', "", text)
test = clean_text(test)

# Run the trained model over the cleaned lyrics and print the text of
# every entity labelled GPE.
nlp = spacy.load("tx_ner_model")
doc = nlp(test)
gpe_names = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
print(gpe_names)
['Karnes', 'Gonzales', 'Belmont', 'Laredo', 'Encinal']