# Helpers from the patch to processors/corp_owners.py (hunk @@ -106,33 +106,52 @@,
# inside class LookupCompaniesHelper), shown in their post-patch form.

# Stand-alone LLC/LLP suffix in its common spellings: LLC, LLP, L L C, L.L.C.,
# L.L.P, LLC. -- the look-arounds require a space on each side without
# consuming it, so neighbouring words stay separated and two adjacent suffixes
# ("L.L.C. L.L.P.") are both rewritten.
_LLC_ABBREVIATION = re.compile(r"(?<= )L[\s.]?L[\s.]?[PC][.]?(?= )")

# Spelled-out legal forms that are all treated as equivalent to "LLC".
_LLC_PHRASE = re.compile(
    r"(?<= )(?:LIMITED LIABILITY COMPANY|LIMITED PARTNERSHIP"
    r"|LTD PARTNERSHIP|LTD PS)(?= )"
)

# Single-word abbreviations and the long form they are expanded to.
_WORD_EXPANSIONS = {
    "ST": "STREET",
    "AVE": "AVENUE",
    "BLVD": "BOULEVARD",
    "PRPTS": "PROPERTIES",
    "PPTY": "PROPERTY",
    "BLDG": "BUILDING",
    "HLDGS": "HOLDINGS",
    "GRP": "GROUP",
    "INVSTMNTS": "INVESTMENTS",
    "FMLY": "FAMILY",
    "CO": "COMPANY",
    "CORP": "CORPORATION",
    "&": "AND",
    "APT": "APARTMENT",
    "APTS": "APARTMENTS",
}


def normalize_name(term):
    """
    Return a canonical, upper-case form of a business name for comparison.

    Steps, in order:
      1. upper-case, drop commas and collapse runs of whitespace;
      2. rewrite LLC/LLP abbreviations (LLC, L.L.C., L L C, LLP, ...) to "LLC";
      3. expand single-word abbreviations (ST -> STREET, CO -> COMPANY, ...);
      4. rewrite spelled-out legal forms (LIMITED LIABILITY COMPANY,
         LIMITED PARTNERSHIP, LTD PS, LTD PARTNERSHIP) to "LLC".

    :param term: the business name or search term (a string)
    :return: the normalized name, without leading/trailing whitespace
    """
    # Commas go first so that "L.L.C., INC" still exposes a clean suffix token.
    without_commas = term.upper().replace(",", "")

    # Pad with one space on each side so whole-word matching needs no \b logic.
    padded = " " + " ".join(without_commas.split()) + " "
    padded = _LLC_ABBREVIATION.sub("LLC", padded)

    # Expand word by word; this runs before the phrase step so that
    # "LIMITED LIABILITY CO" becomes "... COMPANY" and is then recognised.
    words = [_WORD_EXPANSIONS.get(word, word) for word in padded.split()]
    padded = " " + " ".join(words) + " "

    return _LLC_PHRASE.sub("LLC", padded).strip()


def is_exact_match(row, searchTerm):
    """
    Tell whether a row's business name equals the search term once both
    have been passed through normalize_name.

    :param row: mapping-like record exposing a "BusinessName" entry
    :param searchTerm: the name being searched for
    :return: True when the normalized names are identical, False otherwise
             (including when either value is not a string, e.g. a missing name)
    """
    business_name = row["BusinessName"]

    # A missing/non-text name cannot be an exact match; avoid calling .upper() on it.
    if not isinstance(searchTerm, str) or not isinstance(business_name, str):
        return False

    return normalize_name(searchTerm) == normalize_name(business_name)

# NOTE: in the patched method these helpers are followed by the initialisation
# of `exact_matches` and `potential_matches` via `self._get_empty_df()`.