CCFS Database #18

.gitignore (vendored)
@@ -6,4 +6,5 @@ tmp/
*.org~
data/
venv/
*~
.env
@@ -11,7 +11,6 @@ Relevant but not 1-1 walkthrough of how to programmatically find building owners

## Project Structure

- `processors/`: directory for Python scripts to extract, transform, and load the data
- `postgis-data/`: directory that will hold the PostgreSQL data
- `data/`: currently ignored by git; needs to be shared manually
  - `inputs/`: directory with only files that come directly from APIs or public websites
  - `intermediates/`: directory for folders containing intermediate transformed versions of the inputs
@@ -27,6 +26,7 @@ TODO: Find a good source for eviction filing data. Those with access can refer t

## Object Storage

### King County Assessor Data

An S3-compatible object store is hosted at [minio.radmin.live](https://minio.radmin.live).

SDK documentation: https://github.com/minio/minio-py/blob/master/docs/API.md

@@ -36,3 +36,6 @@ Use `lib/minio_helper.py` to extend the functionality

Run `test_minio` in `lib/main.py` to verify that it works (TODO: move this to its own testing script, perhaps unit tests)

Note: You will need `minio_access_key` and `minio_secret_key` in your env before running for this to work (contact @linnealovespie or @ammaratef45 to obtain these keys)
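For a quick smoke test outside `lib/minio_helper.py`, a minimal sketch using the minio SDK; the env variable names follow the note above, and it assumes the endpoint serves TLS (the SDK's default):

```python
import os
from minio import Minio

# Credentials come from the env, as described above; contact the maintainers for keys.
client = Minio(
    "minio.radmin.live",
    access_key=os.environ["minio_access_key"],
    secret_key=os.environ["minio_secret_key"],
)

# Listing buckets is a cheap way to confirm the credentials work.
for bucket in client.list_buckets():
    print(bucket.name)
```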
### CCFS Data

We have our own copy of the [CCFS database](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research). Contact @linnealovespie or @jessib to get the `.env` file needed to load in the connection string secrets.
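For orientation, a minimal sketch of how those secrets get used; it mirrors the `DB_*` variable names and the `corporations` table that `processors/corp_owners.py` reads below:

```python
import os
import psycopg2
from dotenv import load_dotenv

# Pulls DB_NAME, DB_USER, DB_PASS, DB_HOST, DB_PORT out of the shared .env file
load_dotenv()

conn = psycopg2.connect(
    database=os.environ.get("DB_NAME"),
    user=os.environ.get("DB_USER"),
    password=os.environ.get("DB_PASS"),
    host=os.environ.get("DB_HOST"),
    port=os.environ.get("DB_PORT"),
)

# A trivial sanity query against our CCFS copy
with conn.cursor() as cur:
    cur.execute('select count(*) from corporations;')
    print(cur.fetchone()[0])
```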
Binary file not shown.
processors/corp_owners.py

@@ -14,155 +14,90 @@ import json
import os
import re
import urllib.parse
import psycopg2
from dotenv import load_dotenv

search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
# Old search URL, holding onto in case the above gets blocked
# search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'

principal_headers = {
    'Accept-Language': 'en-US,en;q=0.8,es-AR;q=0.5,es;q=0.3',
    'Referer': 'https://ccfs.sos.wa.gov/',
    'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',  # this might be an issue
    'Origin': 'https://ccfs.sos.wa.gov'
}


def get_business_search_payload(business_name, page_count, page_num):
    return {
        'Type': 'BusinessName',
        'SearchEntityName': business_name,
        'SearchType': 'BusinessName',
        'SortType': 'ASC',
        'SortBy': 'Entity Name',
        'SearchValue': business_name,
        'SearchCriteria': 'Contains',
        'IsSearch': 'true',
        'PageID': page_num,
        'PageCount': page_count,
    }


def get_business_details(business_id):
    """ Get business details from the Corporation and charities filing database. """
    url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
    # Old search URL, holding onto in case the above gets blocked
    # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    if os.path.exists(f"../data/inputs/principals_json/{business_id}.json"):
        # Use the locally cached response if we already fetched this business
        with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
            return json.load(f)
    else:
        r = requests.get(url)
        # Try to read the response text
        try:
            r_json = json.loads(r.text)
        except Exception:
            r_json = {}

        try:
            # TODO: Will this write an empty string if no actual request result?
            # Note: mode must be 'w' here; the original 'r' would fail on write
            with open(f"../data/inputs/principals_json/{business_id}.json", 'w') as f:
                str_json = json.dumps(r_json)
                f.write(str_json)
        except Exception:
            pass
        return r_json


load_dotenv()

DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASS = os.environ.get("DB_PASS")
DB_HOST = os.environ.get("DB_HOST")
DB_PORT = os.environ.get("DB_PORT")


class LookupCompaniesHelper:
    def __init__(self, out_path: str):
        self.output_path = out_path  # Absolute path to where the file will be saved

    def _get_empty_df(self):
        return pd.DataFrame([], columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId',
                                         'Address', 'Status', 'address_match'])
        return pd.DataFrame([], columns=['SearchTerm', 'BusinessName', 'address_match'])

    def _get_business_search_results(self, business_name_orig, page_num):
    def _query_db(self, business_name):
        conn = psycopg2.connect(database=DB_NAME,
                                user=DB_USER,
                                password=DB_PASS,
                                host=DB_HOST,
                                port=DB_PORT)
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute("""
                select
                    corporations."Ubi",
                    corporations."BusinessName",
                    corporations."Type",
                    corporations."TypeDescription",
                    corporations."RecordStatus",
                    business_info."MailingAddressLine1",
                    business_info."MailingCity",
                    business_info."MailingState",
                    business_info."MailingCountry",
                    business_info."MailingZip5"
                from business_info
                inner join corporations
                    on business_info."Ubi" = corporations."Ubi"
                where corporations."BusinessName" ~ %s
                limit 10
                ;
                """, (business_name + "*",))
            rows = cur.fetchall()

        row_names = ["Ubi", "BusinessName", "Type", "TypeDescription", "RecordStatus",
                     "MailingAddressLine1", "MailingCity", "MailingState", "MailingCountry", "MailingZip5"]
        table = []
        for r in rows:
            table += [dict(zip(row_names, r))]
        return table
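# Reviewer sketch (not part of the diff): in Postgres, `~` is a POSIX regex match,
# so business_name + "*" makes the trailing "*" mean "previous character, zero or
# more times" rather than a wildcard. Python's regex behaves the same way here:
#
#     import re
#     bool(re.search("GRE*", "GR HOLDINGS"))  # True: "GR" plus zero E's matches
#
# If a prefix match is what's intended, an anchored pattern ('^' + name) or a
# LIKE query may be closer to the goal; flagging this as an assumption about intent.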
    def _get_business_search_results(self, business_name_orig):
        business_name = business_name_orig.strip()
        no_result = True
        result = {}
        while no_result and len(business_name) > 0:
            print(f"searching with name {business_name}")
            r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
            # TODO: add some more error handling in case of connectivity issues.
            if r.status_code == 429:
                # TODO: Raise an error
                print("This IP address has likely been blocked by CCFS, try using a vpn")
            result = json.loads(r.text)
            if len(result) > 0:
                result = self._query_db(business_name)

            # If no search results, try removing the last word in the name
            # This seems to be a decent heuristic because final words are things like LTD, APTS
            # TODO: A more robust search function could make this irrelevant
            if len(result) > 0:
                no_result = False
            else:
                # Strip off the last word from the search term and try again next iteration
                try:
                    # Get the index of the last space in the name
                    last_space = business_name[::-1].index(" ")
                    business_name = business_name[: -1 - last_space].strip()
                except ValueError:
                    # TODO: In this case, try with the LastBuyer instead of ListedOwner?
                    print(f"Found no business with name {business_name_orig}\n")
                    business_name = ""

        return result
        # Get the index of the last space in the name
        last_space = business_name[::-1].index(" ")
        business_name = business_name[: -1 - last_space].strip()

    def _extract_search_results(self, search_term, search_req_response):
        res_list = []
        for res in search_req_response:
            # build up the known responses
            # get more business data from that id
            business_info = get_business_details(res["BusinessID"])
            res_list += [[search_term.strip(),
                          res.get('BusinessName').strip(),
                          res.get('UBINumber'),
                          res.get('BusinessID'),
                          res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'],
                          res.get("BusinessStatus"),
                          business_info.get("BINAICSCodeDesc", "NOT_FOUND")]]
        # return an empty row if no search results
        if len(search_req_response) == 0:
            res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]]

        res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"])

        # Clean some of the results a bit more:
        # Keep only active companies and searches that yielded no results
        res_df = res_df[(res_df["Status"] == "Active") | (res_df["Status"] == "NOT_FOUND")]

        # TODO: Maybe add a filter on BusinessNature for only real estate / property investments
        # TODO: First need to get an idea of all the BusinessNature types

        # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers
        # This check is a very simple heuristic and more robust matching will occur later in processing
        exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
        if exact_match:
            res_df = pd.concat([res_df.iloc[[exact_match[0]], :], res_df.drop(exact_match[0], axis=0)], axis=0)

        return res_df

    def _determine_search_matches(self, search_results_df):
        """
        Mark row as potential match: UBI number is a duplicate, or Address is the same.
        df.duplicated just sees if that address is already in the dataframe, NOT that the search term
        and result have the same address. Could add search terms as a subset for the duplicated call.
        """
        search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False)
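# Reviewer sketch (not part of the diff): what duplicated(subset=['Address'], keep=False)
# does, given the caveat in the docstring above. Any rows sharing an address are all
# flagged, no matter which search term produced them:
#
#     import pandas as pd
#     demo = pd.DataFrame({"SearchTerm": ["GRE DOWNTOWNER LLC", "OTHER LLC"],
#                          "Address": ["308 4TH AVE S", "308 4TH AVE S"]})
#     demo.duplicated(subset=["Address"], keep=False).tolist()  # [True, True]
#
# Passing subset=["SearchTerm", "Address"] would instead require the same search term
# and address to repeat before a row is flagged.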
    def _get_all_company_name_match_search_results(self, owner_name):
        n = 1
        res_length = 100
        search_results = []

        res = self._get_business_search_results(owner_name, n)
        return res
        df = pd.DataFrame(result)
        df["SearchTerm"] = business_name_orig
        return df

    """
    """
    def _get_potential_company_name_matches(self, owner_name):
        all_search_results = self._get_all_company_name_match_search_results(owner_name)
        extracted_results = self._extract_search_results(owner_name, all_search_results)
        self._determine_search_matches(extracted_results)
        return extracted_results
    def _get_potential_company_name_matches(self, owner_name):
        all_search_results = self._get_business_search_results(owner_name)
        df = pd.DataFrame(all_search_results)
        df["SearchTerm"] = owner_name
        return df

    def _separate_search_results(self, results):
    def _separate_search_results(self, results, searchTerm):
        """
        utils to separate search results into exact match, potential match (where no exact match was found),
        and additional matches (extra matches if there was an exact match and additional matches)
@@ -171,9 +106,9 @@ class LookupCompaniesHelper:
        - Partnership
        - etc.
        """
        def is_exact_match(row):
        def is_exact_match(row, searchTerm):
            """ Extract exact matches, including some regex magic. """
            search = row["SearchTerm"]
            search = searchTerm
            result = row["BusinessName"]

            # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
@@ -201,7 +136,7 @@ class LookupCompaniesHelper:
        exact_matches = self._get_empty_df()
        potential_matches = self._get_empty_df()

        exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
        exact_match = results[results.apply(lambda row: is_exact_match(row, searchTerm), axis=1)]
        # TODO: If going to do len(results) check, then need to filter by business nature sooner
        # Len results heuristic doesn't work for empty searches, or the recursive search
        if len(exact_match) > 0:  # or len(results) == 1:
@@ -224,8 +159,8 @@ class LookupCompaniesHelper:

        for owner in owner_list:
            owner = owner.strip()  # Clean owner name slightly
            matches = self._get_potential_company_name_matches(owner)
            temp_exact, temp_potential = self._separate_search_results(matches)
            matches = self._get_business_search_results(owner)
            temp_exact, temp_potential = self._separate_search_results(matches, owner)
            exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True)
            potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True)
        return exact_matches, potential_matches
@@ -242,6 +177,33 @@ class LookupCompaniesHelper:
        exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv')
        potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv')


def get_business_details(business_id):
    """ Get business details from the Corporation and charities filing database. """
    url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
    # Old search URL, holding onto in case the above gets blocked
    # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    if os.path.exists(f"../data/inputs/principals_json/{business_id}.json"):
        # Use the locally cached response if we already fetched this business
        with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
            return json.load(f)
    else:
        r = requests.get(url)
        # Try to read the response text
        try:
            r_json = json.loads(r.text)
        except Exception:
            r_json = {}

        try:
            # TODO: Will this write an empty string if no actual request result?
            # Note: mode must be 'w' here; the original 'r' would fail on write
            with open(f"../data/inputs/principals_json/{business_id}.json", 'w') as f:
                str_json = json.dumps(r_json)
                f.write(str_json)
        except Exception:
            pass
        return r_json


# Not currently in use, needs to be updated
class GroupCompaniesHelper:
    def __init__(self, out_path: str, out_name: str):
        self.output_path = out_path  # The path where the output file will be saved
@@ -390,4 +352,4 @@ class GroupCompaniesHelper:
        results.to_csv(f"{self.output_path}/{self.output_name}")

        results.to_csv(f"{self.output_path}/{self.output_name}")
        return results
        return results
@@ -1,43 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 15 19:06:45 2025

@author: linnea

Script to find exact and potential search results for a parcel owner in the CCFS database

A representative example for the parcel owner (assessor) data scraping step
Address: 308 4th Ave S, Seattle, WA, 98104
ParcelNumber: 5247801370
ListedOwner: GRE DOWNTOWNER LLC
PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY

We happen to already know the answer,
which is that this address is part of Goodman Real Estate's extensive portfolio
GRE List: https://goodmanre.com/our-projects/

TODO:
- Make a flag that shows if the buyer / owner are similar
- Get the address field from CCFS, put in corp_owners
- If the previous buyer doesn't make sense,
  get the year of the last purchase to see if it's at all recent, as a sanity check
"""

from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
import pandas as pd

lookup_helper = LookupCompaniesHelper("../data/intermediates")

# Option 1: Uncomment the two lines to run the full script.
# df = pd.read_csv("../data/intermediates/owners_listed.csv")
# owner_names = df["ListedOwner"].unique()

# Option 2: Uncomment these lines to run with a specific subset for debugging
df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf')
df = df.iloc[1]
owner_names = [df["ListedOwner"]]

exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
processors/test.py (new file)

@@ -0,0 +1,25 @@
"""
|
||||
Helper script for testing out changes to business lookup.
|
||||
Uses the GRE data that we were able to collect by hanad for verification.
|
||||
"""
|
||||
|
||||
from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
lookup_helper = LookupCompaniesHelper(("../data/intermediates"))
|
||||
|
||||
print(os.getcwd())
|
||||
df = pd.read_excel("./experiments/gre_apartments.ods", engine='odf')
|
||||
|
||||
# Option 1: iterate through the whole list of GRE apartment names
|
||||
owner_names = df["ListedOwner"].unique()
|
||||
|
||||
# Option 2: pick a specific owner name
|
||||
# owner_names = ["GRE 4TH AVE S LLC"]
|
||||
|
||||
|
||||
exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
|
||||
|
||||
exact.to_csv("./data/intermediates/exact.csv")
|
||||
potential.to_csv("./data/intermediates/potential.csv")
|
||||
@@ -30,3 +30,6 @@ argon2-cffi-bindings==25.1.0
cffi==2.0.0
minio==7.2.16
pycryptodome==3.23.0
dotenv==0.9.9
python-dotenv==1.2.1
psycopg2-binary==2.9.10

linnealovespie marked this conversation as resolved
> Should `psycopg2` be added here? I needed to install it manually, but maybe that was something with my setup.

Yes it should, good catch. I'd like to add another TODO later to automate this process, because slip-ups like this happen, but my `pip freeze` command looks funky when trying to generate this file.