Skip to content

fasttext

fasttext

Classes:

Name Description
FTEntityMatcher

Attributes:

Name Type Description
manifest

Defines FTEntityMatcher model files

manifest = ModelManifest(model='fasttext', version='3', sources=[ObjectRef(key='pos_neg_terms', file_name='FT_posneg_terms.pickle'), ObjectRef(key='ft_word_vecs', file_name='FTwordvecsnormPCA.pickle'), ObjectRef(key='ft_ngram_vecs', file_name='FTngramvecsPCA.pickle')], visibility=(Visibility.INTERNAL)) module-attribute

Defines FTEntityMatcher model files

Changelog
  • v1: initial model release
  • v2: update pos_neg_terms to include state and county tags (GC-59)
  • v3: FP fixes: "capital_loss" and "capital_gain" are no longer marked as locations

FTEntityMatcher(*, pos_neg_terms, ft_word_vecs, ft_ngram_vecs)

Methods:

Name Description
compute_ngrams_bytes

From fasttext

ft_hash_bytes

Reproduces dictionary used in fastText.

norm

Normalize vector

get_ft_vec

Get FastText vector for a word. If it's OOV, gather the

vec_sim

Compute the cosine similarity between two vectors

ent_score

Determine NERPrediction.score. FT models should all have the same max_score

Attributes:

Name Type Description
max_score float

All fasttext predictions are assigned the max_score

VEC_SIM_SCORE str

Default key on spacy doc where vector similarity score is stored

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def __init__(self, *, pos_neg_terms: dict, ft_word_vecs: dict, ft_ngram_vecs: dict):
    """Build the matcher from pre-trained term lists and FastText vector tables.

    Args:
        pos_neg_terms: dict holding the term lists under the keys
            "pos", "neg", "tag", "last" and "streetSuff".
        ft_word_vecs: mapping of word -> precomputed FastText vector.
        ft_ngram_vecs: mapping of hashed-ngram index -> ngram vector.

    Raises:
        RuntimeError: if the optional spacy or numpy dependencies are
            missing (the module-level imports are expected to set the
            names to None in that case).
    """
    # Fail fast when the optional dependencies were not importable.
    if spacy is None:
        raise RuntimeError("spacy module is not installed")
    if np is None or dot is None:
        raise RuntimeError("numpy is not installed")

    # A blank English pipeline: only tokenization, no pretrained components.
    self.nlp = spacy.blank("en")

    # Unpack the pickled term lists into their own attributes.
    terms = pos_neg_terms
    self.posterms = terms["pos"]
    self.negterms = terms["neg"]
    self.tags = terms["tag"]
    self.lastword = terms["last"]
    self.streetsuff = terms["streetSuff"]

    self.ft_word_vecs = ft_word_vecs
    self.ft_ngram_vecs = ft_ngram_vecs

max_score = 0.8 class-attribute instance-attribute

All fasttext predictions are assigned the max_score

VEC_SIM_SCORE = 'VEC_SIM_SCORE' class-attribute instance-attribute

Default key on spacy doc where vector similarity score is stored

compute_ngrams_bytes(word, min_n, max_n)

From fasttext

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def compute_ngrams_bytes(self, word, min_n, max_n):
    """Byte-level character n-grams of *word*, as computed by fastText.

    Walks the UTF-8 encoding of the word and collects every n-gram of
    ``min_n`` to ``max_n`` characters, never starting or splitting inside
    a multi-byte character.
    """
    encoded = word.encode("utf-8")
    total = len(encoded)
    # A byte of the form 10xxxxxx is a UTF-8 continuation byte.
    CONT_MASK = 0xC0
    CONT_TAG = 0x80
    result = []
    for start in range(total):
        # Never begin an n-gram in the middle of a multi-byte character.
        if encoded[start] & CONT_MASK == CONT_TAG:
            continue
        chunk = []
        pos = start
        size = 1
        while pos < total and size <= max_n:
            chunk.append(encoded[pos])
            pos += 1
            # Pull in the continuation bytes of the current character so
            # the n-gram always ends on a character boundary.
            while pos < total and (encoded[pos] & CONT_MASK) == CONT_TAG:
                chunk.append(encoded[pos])
                pos += 1
            # Single characters touching either word boundary are skipped,
            # mirroring fastText's handling of the begin/end markers.
            if size >= min_n and not (size == 1 and (start == 0 or pos == total)):
                result.append(bytes(chunk))
            size += 1
    return result

ft_hash_bytes(bytez)

Reproduces dictionary used in fastText.

source

https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def ft_hash_bytes(self, bytez):
    """Reproduces dictionary used in fastText.

    source:
        https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc

    This is the 32-bit FNV-1a hash with each input byte sign-extended
    first (the C++ code hashes signed chars). Returns an np.uint32.
    """
    # uint32 arithmetic overflows by design; np.errstate suppresses the
    # resulting RuntimeWarnings and — unlike the previous seterr/seterr
    # pair — restores the error state even if an exception is raised.
    with np.errstate(all="ignore"):
        h = np.uint32(2166136261)
        for b in bytez:
            # Sign-extend the byte in Python ints before converting:
            # NumPy 2 raises OverflowError on out-of-range scalar
            # conversions, so np.int8(b) for b >= 128 (and uint32 of a
            # negative value) can no longer be relied on. This keeps the
            # original NumPy-1 wraparound semantics.
            signed = b - 256 if b >= 128 else b
            h = h ^ np.uint32(signed & 0xFFFFFFFF)
            h = h * np.uint32(16777619)
    return h

norm(vec) staticmethod

Normalize vector

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
@staticmethod
def norm(vec):
    """Scale *vec* to unit Euclidean length; zero vectors pass through unchanged."""
    magnitude = np.sqrt(np.sum(vec**2))
    # Guard against division by zero: the zero vector is returned as-is.
    return vec if magnitude == 0 else vec / magnitude

get_ft_vec(word)

Get FastText vector for a word. If it's OOV, gather the vectors for its ngrams and average them

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def get_ft_vec(self, word):
    """
    Get FastText vector for a word.  If it's OOV, gather the
    vectors for its ngrams and average them.

    Returns the stored vector for an in-vocabulary word, a 100-element
    zero list when no ngrams can be formed, and otherwise the
    normalized mean of the word's ngram embeddings.
    """
    # In-vocabulary: return the precomputed vector directly.
    if word in self.ft_word_vecs:
        return self.ft_word_vecs[word]

    # OOV: compute the byte-level trigrams of the word.
    encoded_ngrams = self.compute_ngrams_bytes(word, 3, 3)

    # Map each ngram to its bucket in the hashed embedding table.
    # NOTE(review): 100000 is presumably the ngram table size — confirm
    # it matches the pickled ft_ngram_vecs.
    ngram_hashes = [self.ft_hash_bytes(n) % 100000 for n in encoded_ngrams]
    if not ngram_hashes:
        # Nothing to average: fall back to the zero vector (assumes
        # 100-dim embeddings, as the original code did).
        return [0] * 100

    # Bug fix: the original accumulated into a plain Python list, so
    # `vec += ndarray` *extended* the list element-by-element instead of
    # adding the vectors, and the later `vec / len(...)` raised a
    # TypeError. Accumulate numerically with numpy instead.
    total = sum(np.asarray(self.ft_ngram_vecs[nh], dtype=float) for nh in ngram_hashes)

    # Average the ngram vectors and normalize the result.
    return self.norm(total / len(ngram_hashes))

vec_sim(a, b)

Compute the cosine similarity between two vectors

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
def vec_sim(self, a, b):
    """Compute the cosine similarity between two words' FastText vectors.

    NOTE(review): this is a plain dot product; it equals cosine
    similarity only when the stored vectors are pre-normalized
    (get_ft_vec normalizes OOV vectors) — confirm for in-vocabulary
    entries.
    """
    lhs = np.array(self.get_ft_vec(a))
    rhs = np.array(self.get_ft_vec(b))
    return dot(lhs, rhs)

ent_score(doc, ent) staticmethod

Determine NERPrediction.score. FT models should all have the same max_score

Source code in src/nemo_safe_synthesizer/pii_replacer/ner/fasttext.py
@staticmethod
def ent_score(doc, ent) -> float:
    """Determine NERPrediction.score. FT models should all have the same max_score"""
    # doc/ent are accepted for signature parity with other scorers but
    # are intentionally unused: every fasttext prediction receives the
    # fixed class-level max_score (0.8).
    return FTEntityMatcher.max_score