Merge branch 'main' into biz-name-jb-8
# Conflicts: # processors/corp_owners.py
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -6,4 +6,5 @@ tmp/
|
||||
*.org~
|
||||
data/
|
||||
venv/
|
||||
*~
|
||||
*~
|
||||
.env
|
||||
@ -11,7 +11,6 @@ Relevant but not 1-1 walkthrough of how to programmatically find building owners
|
||||
## Project Structure
|
||||
|
||||
- `processors/`: directory for Python scripts to extract, transform, and load the data
|
||||
- `postgis-data/`: directory that will hold the PostgreSQL data
|
||||
- `data/`: currently ignored by git, need to share manually
|
||||
- `inputs/`: directory with only files that come directly from APIs or public websites
|
||||
- `intermediates/`: directory for folders containing intermediate transformed versions of the inputs
|
||||
@ -27,6 +26,7 @@ TODO: Find a good source for eviction filing data. Those with access can refer t
|
||||
|
||||
## Object Storage
|
||||
|
||||
### King County Assessor Data
|
||||
An S3-compatible storage is hosted on [minio.radmin.live](https://minio.radmin.live)
|
||||
|
||||
SDK documentation: https://github.com/minio/minio-py/blob/master/docs/API.md
|
||||
@ -36,3 +36,6 @@ Use `lib/minio_helper.py` to extend the functionality
|
||||
Run `test_minio` in `lib/main.py` to test out that it works (TODO: move this to own testing script, perhaps unit tests)
|
||||
|
||||
Note: You will need to have minio_access_key and minio_secret_key in your env before running for this to work; contact @linnealovespie or @ammaratef45 to obtain these keys.
|
||||
|
||||
### CCFS Data
|
||||
We have our own copy of the [CCFS database](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research). Contact @linnealovespie or @jessib to get the `.env` file needed to load in the connection string secrets.
|
||||
Binary file not shown.
@ -14,155 +14,90 @@ import json
|
||||
import os
|
||||
import re
|
||||
import urllib.parse
|
||||
import psycopg2
|
||||
from dotenv import load_dotenv
|
||||
|
||||
search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
|
||||
# Old search URL, holding onto in case the above gets blocked
|
||||
# search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
|
||||
principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
|
||||
|
||||
principal_headers = {
|
||||
'Accept-Language': 'en-US,en;q=0.8,es-AR;q=0.5,es;q=0.3',
|
||||
'Referer': 'https://ccfs.sos.wa.gov/',
|
||||
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', # this might be an issue
|
||||
'Origin': 'https://ccfs.sos.wa.gov'
|
||||
}
|
||||
|
||||
def get_business_search_payload(business_name, page_count, page_num):
|
||||
return {
|
||||
'Type': 'BusinessName',
|
||||
'SearchEntityName': business_name,
|
||||
'SearchType': 'BusinessName',
|
||||
'SortType': 'ASC',
|
||||
'SortBy': 'Entity Name',
|
||||
'SearchValue': business_name,
|
||||
'SearchCriteria': 'Contains',
|
||||
'IsSearch': 'true',
|
||||
'PageID': page_num,
|
||||
'PageCount': page_count,
|
||||
}
|
||||
|
||||
def get_business_details(business_id):
|
||||
""" Get business details from the Corporation and charities filing database.
|
||||
"""
|
||||
url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
|
||||
# Old search URL, holding onto in case the above gets blocked
|
||||
# url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
|
||||
if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")):
|
||||
with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
|
||||
return json.load(f)
|
||||
else:
|
||||
r = requests.get(url)
|
||||
# Try to read the response text
|
||||
try:
|
||||
r_json = json.loads(r.text)
|
||||
except:
|
||||
r_json = {}
|
||||
|
||||
try:
|
||||
# TODO: Will this write an empty string if no actual request result?
|
||||
with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
|
||||
str_json = json.dumps(r_json)
|
||||
f.write(str_json)
|
||||
except:
|
||||
pass
|
||||
return r_json
|
||||
load_dotenv()
|
||||
|
||||
DB_NAME = os.environ.get("DB_NAME")
|
||||
DB_USER = os.environ.get("DB_USER")
|
||||
DB_PASS = os.environ.get("DB_PASS")
|
||||
DB_HOST = os.environ.get("DB_HOST")
|
||||
DB_PORT = os.environ.get("DB_PORT")
|
||||
|
||||
class LookupCompaniesHelper:
|
||||
def __init__(self, out_path: str):
|
||||
self.output_path = out_path # Absolute path to where the file will be saved
|
||||
|
||||
def _get_empty_df(self):
|
||||
return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId',
|
||||
'Address', 'Status', 'address_match'])
|
||||
return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName','address_match'])
|
||||
|
||||
def _get_business_search_results(self, business_name_orig, page_num):
|
||||
def _query_db(self, business_name):
    """Look up corporations by name in the local copy of the CCFS database.

    Joins `business_info` with `corporations` on "Ubi" and returns at most
    10 rows whose "BusinessName" matches the search pattern.

    Parameters
    ----------
    business_name : str
        Name to search for. It is passed to PostgreSQL's `~` operator, so it
        is interpreted as a (case-sensitive) POSIX regular expression.

    Returns
    -------
    list[dict]
        One dict per matching row, keyed by the selected column names.
        Empty list when nothing matches.
    """
    row_names = ["Ubi", "BusinessName", "Type", "TypeDescription", "RecordStatus",
                 "MailingAddressLine1", "MailingCity", "MailingState",
                 "MailingCountry", "MailingZip5"]

    conn = psycopg2.connect(database=DB_NAME,
                            user=DB_USER,
                            password=DB_PASS,
                            host=DB_HOST,
                            port=DB_PORT)
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            # NOTE(review): with the regex operator `~`, the trailing "*" only
            # makes the final character of the name repeatable/optional, and
            # regex metacharacters in a business name are not escaped. If a
            # prefix/substring match was intended, consider LIKE/ILIKE with an
            # escaped pattern instead. Left unchanged to preserve results.
            cur.execute("""
                select
                    corporations."Ubi",
                    corporations."BusinessName",
                    corporations."Type",
                    corporations."TypeDescription",
                    corporations."RecordStatus",
                    business_info."MailingAddressLine1",
                    business_info."MailingCity",
                    business_info."MailingState",
                    business_info."MailingCountry",
                    business_info."MailingZip5"
                from business_info
                inner join corporations
                    on business_info."Ubi" = corporations."Ubi"
                where corporations."BusinessName" ~ %s
                limit 10
                ;
                """, (business_name + "*",))
            rows = cur.fetchall()
    finally:
        # The cursor context manager does not close the connection; without
        # this, every lookup would leave a connection open on the server.
        conn.close()

    return [dict(zip(row_names, r)) for r in rows]
|
||||
|
||||
def _get_business_search_results(self, business_name_orig):
|
||||
business_name = business_name_orig.strip()
|
||||
no_result = True
|
||||
result = {}
|
||||
while no_result and len(business_name) > 0:
|
||||
print(f"searching with name {business_name}")
|
||||
r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
|
||||
# TODO: add some more error handling in case of connectivity issues.
|
||||
if r.status_code == 429:
|
||||
# TODO: Raise an error
|
||||
print("This IP address has likely been blocked by CCFS, try using a vpn")
|
||||
result = json.loads(r.text)
|
||||
if len(result) > 0:
|
||||
result = self._query_db(business_name)
|
||||
|
||||
# If no search results, try removing the last word in the name
|
||||
# This seems to be a decent heuristic because final words are things like LTD, APTS
|
||||
# TODO: A more robust search function could make this irrelevant
|
||||
if(len(result) > 0):
|
||||
no_result = False
|
||||
else:
|
||||
# Strip off the last word from the search term and try again next iteration
|
||||
try:
|
||||
# Get the index of the last space in the name
|
||||
last_space = business_name[::-1].index(" ")
|
||||
business_name = business_name[: -1 - last_space].strip()
|
||||
except ValueError:
|
||||
# TODO: In this case, try with the LastBuyer instead of ListedOwner?
|
||||
print(f"Found no business with name {business_name_orig}\n")
|
||||
business_name = ""
|
||||
|
||||
|
||||
return result
|
||||
# Get the index of the last space in the name
|
||||
last_space = business_name[::-1].index(" ")
|
||||
business_name = business_name[: -1 - last_space].strip()
|
||||
|
||||
def _extract_search_results(self, search_term, search_req_response):
|
||||
res_list = []
|
||||
for res in search_req_response:
|
||||
# build up the known responses
|
||||
# get more business data from that id
|
||||
business_info = get_business_details(res["BusinessID"])
|
||||
res_list += [[search_term.strip(),
|
||||
res.get('BusinessName').strip(),
|
||||
res.get('UBINumber'),
|
||||
res.get('BusinessID'),
|
||||
res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'],
|
||||
res.get("BusinessStatus"),
|
||||
business_info.get("BINAICSCodeDesc", "NOT_FOUND")]]
|
||||
# return an empty row if no search results
|
||||
if len(search_req_response) == 0:
|
||||
res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]]
|
||||
|
||||
res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"])
|
||||
|
||||
# Clean some of the results a bit more:
|
||||
# Keep only active companies and searches that yielded no results
|
||||
res_df = res_df[(res_df["Status"]=="Active") | (res_df["Status"]=="NOT_FOUND")]
|
||||
|
||||
# TODO: Maybe add a filter on BusinessNature for only real estate/ property investments
|
||||
# TODO: First need to get an idea of all the BusinessNature types
|
||||
|
||||
# Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers
|
||||
# This check is very simple heuristic and more robust matching will occur later in processing
|
||||
exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
|
||||
if exact_match:
|
||||
res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0)
|
||||
|
||||
return res_df
|
||||
|
||||
def _determine_search_matches(self, search_results_df):
|
||||
"""
|
||||
Mark row as potential match: UBI number is a duplicate, or Address is the same
|
||||
df.duplicated just sees if that address is already in the dataframe, NOT that the search term
|
||||
and result have the same address. Could add search terms as a subset for duplicated call
|
||||
"""
|
||||
search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False)
|
||||
|
||||
def _get_all_company_name_match_search_results(self, owner_name):
|
||||
n = 1
|
||||
res_length = 100
|
||||
search_results = []
|
||||
|
||||
res = self._get_business_search_results(owner_name, n)
|
||||
return res
|
||||
df = pd.DataFrame(result)
|
||||
df["SearchTerm"] = business_name_orig
|
||||
return df
|
||||
|
||||
"""
|
||||
"""
|
||||
def _get_potential_company_name_matches(self, owner_name):
|
||||
all_search_results = self._get_all_company_name_match_search_results(owner_name)
|
||||
extracted_results = self._extract_search_results(owner_name, all_search_results)
|
||||
self._determine_search_matches(extracted_results)
|
||||
return extracted_results
|
||||
def _get_potential_company_name_matches(self, owner_name):
|
||||
all_search_results = self._get_business_search_results(owner_name)
|
||||
df = pd.DataFrame(all_search_results)
|
||||
df["SearchTerm"] = owner_name
|
||||
return df
|
||||
|
||||
def _separate_search_results(self, results):
|
||||
def _separate_search_results(self, results, searchTerm):
|
||||
"""
|
||||
utils to separate search results into exact match, potential match (where no exact match was found),
|
||||
and additional matches (extra matches if there was an exact match and additional matches)
|
||||
@ -211,16 +146,16 @@ class LookupCompaniesHelper:
|
||||
|
||||
return term.strip()
|
||||
|
||||
def is_exact_match(row):
|
||||
def is_exact_match(row, searchTerm):
|
||||
""" Extract exact matches, including some regex magic. """
|
||||
search = row["SearchTerm"]
|
||||
search = searchTerm
|
||||
result = row["BusinessName"]
|
||||
return normalize_name(search) == normalize_name(result)
|
||||
|
||||
exact_matches = self._get_empty_df()
|
||||
potential_matches = self._get_empty_df()
|
||||
|
||||
exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
|
||||
exact_match = results[results.apply(lambda row: is_exact_match(row, searchTerm), axis=1)]
|
||||
# TODO: If going to do len(results) check, then need to filter by business nature sooner
|
||||
# Len results heuristic doesn't work for empty searches, or the recursive search
|
||||
if len(exact_match) > 0: #or len(results) == 1:
|
||||
@ -243,8 +178,8 @@ class LookupCompaniesHelper:
|
||||
|
||||
for owner in owner_list:
|
||||
owner = owner.strip() # Clean owner name slightly
|
||||
matches = self._get_potential_company_name_matches(owner)
|
||||
temp_exact, temp_potential = self._separate_search_results(matches)
|
||||
matches = self._get_business_search_results(owner)
|
||||
temp_exact, temp_potential = self._separate_search_results(matches, owner)
|
||||
exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True)
|
||||
potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True)
|
||||
return exact_matches, potential_matches
|
||||
@ -261,6 +196,33 @@ class LookupCompaniesHelper:
|
||||
exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv')
|
||||
potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv')
|
||||
|
||||
def get_business_details(business_id):
    """Get business details from the Corporation and charities filing database.

    Results are cached on disk as
    `../data/inputs/principals_json/<business_id>.json`; a cached file, when
    present, is returned without making a network request.

    Parameters
    ----------
    business_id
        The CCFS business ID to look up.

    Returns
    -------
    The decoded JSON response, or an empty dict when the response body is
    not valid JSON.
    """
    cache_path = f"../data/inputs/principals_json/{business_id}.json"

    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            return json.load(f)

    url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
    # Old search URL, holding onto in case the above gets blocked
    # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    r = requests.get(url)

    # Try to read the response text; a non-JSON body (e.g. an error page)
    # is treated as "no result".
    try:
        r_json = json.loads(r.text)
    except ValueError:
        r_json = {}

    # Only cache real results: caching an empty result would make every
    # later call return {} from disk without ever retrying the request.
    if r_json:
        try:
            # Must be opened for writing; the file does not exist yet on this
            # path, so opening it in read mode always failed and nothing was
            # ever cached.
            with open(cache_path, 'w') as f:
                json.dump(r_json, f)
        except OSError as err:
            # Caching is best-effort (e.g. the directory may not exist);
            # the lookup result is still returned.
            print(f"Could not cache business details to {cache_path}: {err}")

    return r_json
|
||||
|
||||
# Not currently in use, needs to be updated
|
||||
class GroupCompaniesHelper:
|
||||
def __init__(self, out_path: str, out_name: str):
|
||||
self.output_path = out_path # The path to the output file to save the output file
|
||||
@ -409,4 +371,4 @@ class GroupCompaniesHelper:
|
||||
results.to_csv(f"{self.output_path}/{self.output_name}")
|
||||
|
||||
results.to_csv(f"{self.output_path}/{self.output_name}")
|
||||
return results
|
||||
return results
|
||||
|
||||
@ -1,43 +0,0 @@
|
||||
#!/usr/bin/env python3get_company_list_name_matches
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Aug 15 19:06:45 2025
|
||||
|
||||
@author: linnea
|
||||
|
||||
Script to find exact and potential search results for a parcel owner in the CCFS database
|
||||
|
||||
A representative example for the parcel owner (assessor) data scraping step
|
||||
Address: 308 4th Ave S, Seattle, WA, 98104
|
||||
ParcelNumber: 5247801370
|
||||
ListedOwner: GRE DOWNTOWNER LLC
|
||||
PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY
|
||||
|
||||
We happen to already know the answer,
|
||||
which is this address is part of Goodman Real Estate's extensive portfolio
|
||||
GRE List: https://goodmanre.com/our-projects/
|
||||
|
||||
TODO:
|
||||
- Make a flag that shows if the buyer / owner are similar
|
||||
- Get the address field from CCFS, put in corp_owners
|
||||
- If the previous buyer doesn't make sense,
|
||||
get the year of the last buying to see if it's at all recent for sanity checks
|
||||
"""
|
||||
|
||||
from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
|
||||
import pandas as pd
|
||||
|
||||
lookup_helper = LookupCompaniesHelper(("../data/intermediates"))
|
||||
|
||||
# Option 1: Uncomment the two lines to run the full script.
|
||||
# df = pd.read_csv("../data/intermediates/owners_listed.csv")
|
||||
# owner_names = df["ListedOwner"].unique()
|
||||
|
||||
# Option 2: Uncomment two lines to run with a specific subset for debugging
|
||||
df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf')
|
||||
df = df.iloc[1]
|
||||
owner_names = [df["ListedOwner"]]
|
||||
|
||||
exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
|
||||
|
||||
|
||||
25
processors/test.py
Normal file
25
processors/test.py
Normal file
@ -0,0 +1,25 @@
|
||||
"""
|
||||
Helper script for testing out changes to business lookup.
|
||||
Uses the GRE data that we were able to collect by hand for verification.
|
||||
"""
|
||||
|
||||
from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
lookup_helper = LookupCompaniesHelper(("../data/intermediates"))
|
||||
|
||||
print(os.getcwd())
|
||||
df = pd.read_excel("./experiments/gre_apartments.ods", engine='odf')
|
||||
|
||||
# Option 1: iterate through the whole list of GRE apartment names
|
||||
owner_names = df["ListedOwner"].unique()
|
||||
|
||||
# Option 2: pick a specific owner name
|
||||
# owner_names = ["GRE 4TH AVE S LLC"]
|
||||
|
||||
|
||||
exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
|
||||
|
||||
exact.to_csv("./data/intermediates/exact.csv")
|
||||
potential.to_csv("./data/intermediates/potential.csv")
|
||||
@ -30,3 +30,6 @@ argon2-cffi-bindings==25.1.0
|
||||
cffi==2.0.0
|
||||
minio==7.2.16
|
||||
pycryptodome==3.23.0
|
||||
dotenv==0.9.9
|
||||
python-dotenv==1.2.1
|
||||
psycopg2-binary==2.9.10
|
||||
|
||||
Reference in New Issue
Block a user