Skip to content

fasttext

fasttext

Classes:

Name Description
FTEntityMatcher

Attributes:

Name Type Description
manifest

Defines FTEntityMatcher model files

manifest = ModelManifest(model='fasttext', version='3', sources=[ObjectRef(key='pos_neg_terms', file_name='FT_posneg_terms.pickle'), ObjectRef(key='ft_word_vecs', file_name='FTwordvecsnormPCA.pickle'), ObjectRef(key='ft_ngram_vecs', file_name='FTngramvecsPCA.pickle')], visibility=(Visibility.INTERNAL)) module-attribute

Defines FTEntityMatcher model files

Changelog
  • v1: initial model release
  • v2: update pos_neg_terms to include state and county tags (GC-59)
  • v3: FP fixes: "capital_loss" and "capital_gain" are no longer marked as locations

FTEntityMatcher(*, pos_neg_terms, ft_word_vecs, ft_ngram_vecs)

Methods:

Name Description
compute_ngrams_bytes

From fasttext

ft_hash_bytes

Reproduces dictionary used in fastText.

norm

Normalize vector

get_ft_vec

Get FastText vector for a word. If it's OOV, gather the

vec_sim

Compute the cosine similarity between two vectors

ent_score

Determine NERPrediction.score. FT models should all have the same max_score

Attributes:

Name Type Description
max_score float

All fasttext predictions are assigned the max_score

VEC_SIM_SCORE str

Default key on spacy doc where vector similarity score is stored

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def __init__(self, *, pos_neg_terms: dict, ft_word_vecs: dict, ft_ngram_vecs: dict):
    """Build the matcher from pre-trained term lists and FastText vector tables.

    Args:
        pos_neg_terms: dict holding the term lists under the keys
            "pos", "neg", "tag", "last" and "streetSuff".
        ft_word_vecs: mapping of word -> precomputed FastText vector.
        ft_ngram_vecs: mapping of hashed-ngram index -> ngram vector.

    Raises:
        RuntimeError: if the optional spacy or numpy dependencies are
            missing (the module-level imports are expected to set the
            names to None in that case).
    """
    # Fail fast when the optional dependencies were not importable.
    if spacy is None:
        raise RuntimeError("spacy module is not installed")
    if np is None or dot is None:
        raise RuntimeError("numpy is not installed")

    # A blank English pipeline: only tokenization, no pretrained components.
    self.nlp = spacy.blank("en")

    # Unpack the pickled term lists into their own attributes.
    terms = pos_neg_terms
    self.posterms = terms["pos"]
    self.negterms = terms["neg"]
    self.tags = terms["tag"]
    self.lastword = terms["last"]
    self.streetsuff = terms["streetSuff"]

    self.ft_word_vecs = ft_word_vecs
    self.ft_ngram_vecs = ft_ngram_vecs

max_score = 0.8 class-attribute instance-attribute

All fasttext predictions are assigned the max_score

VEC_SIM_SCORE = 'VEC_SIM_SCORE' class-attribute instance-attribute

Default key on spacy doc where vector similarity score is stored

compute_ngrams_bytes(word, min_n, max_n)

From fasttext

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def compute_ngrams_bytes(self, word, min_n, max_n):
    """Byte-level character n-grams of *word*, as computed by fastText.

    Walks the UTF-8 encoding of the word and collects every n-gram of
    ``min_n`` to ``max_n`` characters, never starting or splitting inside
    a multi-byte character.
    """
    encoded = word.encode("utf-8")
    total = len(encoded)
    # A byte of the form 10xxxxxx is a UTF-8 continuation byte.
    CONT_MASK = 0xC0
    CONT_TAG = 0x80
    result = []
    for start in range(total):
        # Never begin an n-gram in the middle of a multi-byte character.
        if encoded[start] & CONT_MASK == CONT_TAG:
            continue
        chunk = []
        pos = start
        size = 1
        while pos < total and size <= max_n:
            chunk.append(encoded[pos])
            pos += 1
            # Pull in the continuation bytes of the current character so
            # the n-gram always ends on a character boundary.
            while pos < total and (encoded[pos] & CONT_MASK) == CONT_TAG:
                chunk.append(encoded[pos])
                pos += 1
            # Single characters touching either word boundary are skipped,
            # mirroring fastText's handling of the begin/end markers.
            if size >= min_n and not (size == 1 and (start == 0 or pos == total)):
                result.append(bytes(chunk))
            size += 1
    return result

ft_hash_bytes(bytez)

Reproduces dictionary used in fastText.

source

https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def ft_hash_bytes(self, bytez):
    """Reproduces dictionary used in fastText.

    source:
        https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc

    This is the 32-bit FNV-1a hash with each input byte sign-extended
    first (the C++ code hashes signed chars). Returns an np.uint32.
    """
    # uint32 arithmetic overflows by design; np.errstate suppresses the
    # resulting RuntimeWarnings and — unlike the previous seterr/seterr
    # pair — restores the error state even if an exception is raised.
    with np.errstate(all="ignore"):
        h = np.uint32(2166136261)
        for b in bytez:
            # Sign-extend the byte in Python ints before converting:
            # NumPy 2 raises OverflowError on out-of-range scalar
            # conversions, so np.int8(b) for b >= 128 (and uint32 of a
            # negative value) can no longer be relied on. This keeps the
            # original NumPy-1 wraparound semantics.
            signed = b - 256 if b >= 128 else b
            h = h ^ np.uint32(signed & 0xFFFFFFFF)
            h = h * np.uint32(16777619)
    return h

norm(vec) staticmethod

Normalize vector

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
@staticmethod
def norm(vec):
    """Scale *vec* to unit Euclidean length; zero vectors pass through unchanged."""
    magnitude = np.sqrt(np.sum(vec**2))
    # Guard against division by zero: the zero vector is returned as-is.
    return vec if magnitude == 0 else vec / magnitude

get_ft_vec(word)

Get FastText vector for a word. If it's OOV, gather the vectors for its ngrams and average them

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def get_ft_vec(self, word):
    """
    Get FastText vector for a word.  If it's OOV, gather the
    vectors for its ngrams and average them.

    Returns the stored vector for an in-vocabulary word, a 100-element
    zero list when no ngrams can be formed, and otherwise the
    normalized mean of the word's ngram embeddings.
    """
    # In-vocabulary: return the precomputed vector directly.
    if word in self.ft_word_vecs:
        return self.ft_word_vecs[word]

    # OOV: compute the byte-level trigrams of the word.
    encoded_ngrams = self.compute_ngrams_bytes(word, 3, 3)

    # Map each ngram to its bucket in the hashed embedding table.
    # NOTE(review): 100000 is presumably the ngram table size — confirm
    # it matches the pickled ft_ngram_vecs.
    ngram_hashes = [self.ft_hash_bytes(n) % 100000 for n in encoded_ngrams]
    if not ngram_hashes:
        # Nothing to average: fall back to the zero vector (assumes
        # 100-dim embeddings, as the original code did).
        return [0] * 100

    # Bug fix: the original accumulated into a plain Python list, so
    # `vec += ndarray` *extended* the list element-by-element instead of
    # adding the vectors, and the later `vec / len(...)` raised a
    # TypeError. Accumulate numerically with numpy instead.
    total = sum(np.asarray(self.ft_ngram_vecs[nh], dtype=float) for nh in ngram_hashes)

    # Average the ngram vectors and normalize the result.
    return self.norm(total / len(ngram_hashes))

vec_sim(a, b)

Compute the cosine similarity between two vectors

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def vec_sim(self, a, b):
    """Compute the cosine similarity between two words' FastText vectors.

    NOTE(review): this is a plain dot product; it equals cosine
    similarity only when the stored vectors are pre-normalized
    (get_ft_vec normalizes OOV vectors) — confirm for in-vocabulary
    entries.
    """
    lhs = np.array(self.get_ft_vec(a))
    rhs = np.array(self.get_ft_vec(b))
    return dot(lhs, rhs)

ent_score(doc, ent) staticmethod

Determine NERPrediction.score. FT models should all have the same max_score

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
@staticmethod
def ent_score(doc, ent) -> float:
    """Determine NERPrediction.score. FT models should all have the same max_score"""
    # doc/ent are accepted for signature parity with other scorers but
    # are intentionally unused: every fasttext prediction receives the
    # fixed class-level max_score (0.8).
    return FTEntityMatcher.max_score