diff --git a/.gitignore b/.gitignore
index 09833da..601dce3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ tmp/
 *.org~
 data/
 venv/
-*~
\ No newline at end of file
+*~
+.env
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e3373a..a7bd07d 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,6 @@ Relevant but not 1-1 walkthrough of how to programmatically find building owners
 
 ## Project Structure
 - `processors/`: directory for Python scripts to extract, transform, and load the data
-- `postgis-data/`: directory that will hold the PostgreSQL data
 - `data/`: currently ignored by git, need to share manually
 - `inputs/`: directory with only files that come directly from APIs or public websites
 - `intermediates/`: directory for folders containing intermediate transformed versions of the inputs
@@ -27,6 +26,7 @@ TODO: Find a good source for eviction filing data. Those with access can refer t
 
 ## Object Storage
+### King County Assessor Data
 An S3 compatible storage is hosted on [minio.radmin.live](minio.radmin.live)
 
 SDK documentation: https://github.com/minio/minio-py/blob/master/docs/API.md
@@ -36,3 +36,6 @@ Use `lib/minio_helper.py` to extend the functionality
 Run `test_minio` in `lib/main.py` to test out that it works (TODO: move this to own testing script, perhaps unit tests)
 
 Note: You will need to have minio_access_key and minio_secret_key in your env before running for this to work, contact @linnealovespie or @ammaratef45 to obtain these keys)
+
+### CCFS Data
+We have our own copy of the [CCFS (Corporations and Charities Filing System) database](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research). Contact @linnealovespie or @jessib to get the `.env` file that supplies the database connection secrets.
\ No newline at end of file
diff --git a/experiments/gre_apartments.ods b/experiments/gre_apartments.ods
index e3c4516..514125a 100644
Binary files a/experiments/gre_apartments.ods and b/experiments/gre_apartments.ods differ
diff --git a/processors/corp_owners.py b/processors/corp_owners.py
index 56f40b8..f745cba 100644
--- a/processors/corp_owners.py
+++ b/processors/corp_owners.py
@@ -14,155 +14,90 @@ import json
 import os
 import re
 import urllib.parse
+import psycopg2
+from dotenv import load_dotenv
 
-search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
-# Old search URL, holding onto in case the above gets blocked
-# search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
-principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
-
-principal_headers = {
-    'Accept-Language': 'en-US,en;q=0.8,es-AR;q=0.5,es;q=0.3',
-    'Referer': 'https://ccfs.sos.wa.gov/',
-    'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', # this might be an issue
-    'Origin': 'https://ccfs.sos.wa.gov'
-}
-
-def get_business_search_payload(business_name, page_count, page_num):
-    return {
-        'Type': 'BusinessName',
-        'SearchEntityName': business_name,
-        'SearchType': 'BusinessName',
-        'SortType': 'ASC',
-        'SortBy': 'Entity Name',
-        'SearchValue': business_name,
-        'SearchCriteria': 'Contains',
-        'IsSearch': 'true',
-        'PageID': page_num,
-        'PageCount': page_count,
-    }
-
-def get_business_details(business_id):
-    """ Get business details from the Corporation and charities filing database.
- """ - url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}" - # Old search URL, holding onto in case the above gets blocked - # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id) - if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")): - with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: - return json.load(f) - else: - r = requests.get(url) - # Try to read the response text - try: - r_json = json.loads(r.text) - except: - r_json = {} - - try: - # TODO: Will this write an empty string if no actual request result? - with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: - str_json = json.dumps(r_json) - f.write(str_json) - except: - pass - return r_json +load_dotenv() +DB_NAME = os.environ.get("DB_NAME") +DB_USER = os.environ.get("DB_USER") +DB_PASS = os.environ.get("DB_PASS") +DB_HOST = os.environ.get("DB_HOST") +DB_PORT = os.environ.get("DB_PORT") class LookupCompaniesHelper: def __init__(self, out_path: str): self.output_path = out_path # Absolute path to where the file will be saved def _get_empty_df(self): - return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', - 'Address', 'Status', 'address_match']) + return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName','address_match']) - def _get_business_search_results(self, business_name_orig, page_num): + def _query_db(self,business_name): + conn = psycopg2.connect(database=DB_NAME, + user=DB_USER, + password=DB_PASS, + host=DB_HOST, + port=DB_PORT) + conn.autocommit = True + with conn.cursor() as cur: + cur.execute(""" + select + corporations."Ubi", + corporations."BusinessName", + corporations."Type", + corporations."TypeDescription", + corporations."RecordStatus", + business_info."MailingAddressLine1", + business_info."MailingCity", + business_info."MailingState", + business_info."MailingCountry", + business_info."MailingZip5" + from business_info + inner join corporations + on business_info."Ubi" = corporations."Ubi" + where corporations."BusinessName" ~ %s + limit 10 + ; + """, (business_name+"*",)) + rows = cur.fetchall() + + row_names = ["Ubi", "BusinessName","Type","TypeDescription","RecordStatus", "MailingAddressLine1", "MailingCity","MailingState","MailingCountry","MailingZip5"] + table = [] + for r in rows: + table += [dict(zip(row_names, r))] + return table + + def _get_business_search_results(self, business_name_orig): business_name = business_name_orig.strip() no_result = True result = {} while no_result and len(business_name) > 0: print(f"searching with name {business_name}") - r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num)) - # TODO: add some more error handling in case of connectivity issues. 
-            if r.status_code == 429:
-                # TODO: Raise an error
-                print("This IP address has likely been blocked by CCFS, try using a vpn")
-            result = json.loads(r.text)
-            if len(result) > 0:
+            result = self._query_db(business_name)
+
+            # If no search results, try removing the last word in the name
+            # This seems to be a decent heuristic because final words are things like LTD, APTS
+            # TODO: A more robust search function could make this irrelevant
+            if len(result) > 0:
                 no_result = False
             else:
-                # Strip off the last word from the search term and try again next iteration
-                try:
-                    # Get the index of the last space in the name
-                    last_space = business_name[::-1].index(" ")
-                    business_name = business_name[: -1 - last_space].strip()
-                except ValueError:
-                    # TODO: In this case, try with the LastBuyer in stead of ListedOwner?
-                    print(f"Found no business with name {business_name_orig}\n")
-                    business_name = ""
-
-
-        return result
+                # Drop the last word and retry; rpartition returns "" when no
+                # space is left, which ends the loop instead of raising ValueError
+                business_name = business_name.rpartition(" ")[0].strip()
 
-    def _extract_search_results(self, search_term, search_req_response):
-        res_list = []
-        for res in search_req_response:
-            # build up the known responses
-            # get more business data from that id
-            business_info = get_business_details(res["BusinessID"])
-            res_list += [[search_term.strip(),
-                          res.get('BusinessName').strip(),
-                          res.get('UBINumber'),
-                          res.get('BusinessID'),
-                          res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'],
-                          res.get("BusinessStatus"),
-                          business_info.get("BINAICSCodeDesc", "NOT_FOUND")]]
-        # return an empty row if no search results
-        if len(search_req_response) == 0:
-            res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]]
-
-        res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"])
-
-        # Clean some of the results a bit more:
-        # Keep only active companies and searches that yielded no results
-        res_df = res_df[(res_df["Status"]=="Active") | (res_df["Status"]=="NOT_FOUND")]
-
-        # TODO: Maybe add a filter on BusinessNature for only real estate/ property investments
-        # TODO: First need to get an idea of all the BusinessNature types
-
-        # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers
-        # This check is very simple heuristic and more robust matching will occur later in processing
-        exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
-        if exact_match:
-            res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0)
-
-        return res_df
-
-    def _determine_search_matches(self, search_results_df):
-        """
-        Mark row as potential match: UBI number is a duplicate, or Address is the same
-        df.duplicated just sees if that address is already in the dataframe, NOT that the serach term
-        and result have the same address.
-        Could add search terms as a subset for duplicated call
-        """
-        search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False)
-
-    def _get_all_company_name_match_search_results(self, owner_name):
-        n = 1
-        res_length = 100
-        search_results = []
-
-        res = self._get_business_search_results(owner_name, n)
-        return res
+        df = pd.DataFrame(result)
+        df["SearchTerm"] = business_name_orig
+        return df
 
     """
    """
-    def _get_potential_company_name_matches(self, owner_name):
-        all_search_results = self._get_all_company_name_match_search_results(owner_name)
-        extracted_results = self._extract_search_results(owner_name, all_search_results)
-        self._determine_search_matches(extracted_results)
-        return extracted_results
+    def _get_potential_company_name_matches(self, owner_name):
+        all_search_results = self._get_business_search_results(owner_name)
+        df = pd.DataFrame(all_search_results)
+        df["SearchTerm"] = owner_name
+        return df
 
-    def _separate_search_results(self, results):
+    def _separate_search_results(self, results, search_term):
         """
         utils to separate search results into exact match, potential match (where no exact match was found),
         and additional matches (extra matches if there was an exact match and additional matches)
@@ -171,9 +106,9 @@ class LookupCompaniesHelper:
         - Partnership
         - etc.
         """
-        def is_exact_match(row):
+        def is_exact_match(row, search_term):
             """ Extract exact matches, including some regex magic. """
-            search = row["SearchTerm"]
+            search = search_term
             result = row["BusinessName"]
 
             # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
@@ -201,7 +136,7 @@ class LookupCompaniesHelper:
         exact_matches = self._get_empty_df()
         potential_matches = self._get_empty_df()
 
-        exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
+        exact_match = results[results.apply(lambda row: is_exact_match(row, search_term), axis=1)]
         # TODO: If going to do len(results) check, then need to filter by business nature sooner
         # Len results heuristic doesn't work for empty searches, or the recursive search
         if len(exact_match) > 0: #or len(results) == 1:
@@ -224,8 +159,8 @@ class LookupCompaniesHelper:
         for owner in owner_list:
             owner = owner.strip() # Clean owner name slightly
 
-            matches = self._get_potential_company_name_matches(owner)
-            temp_exact, temp_potential = self._separate_search_results(matches)
+            matches = self._get_business_search_results(owner)
+            temp_exact, temp_potential = self._separate_search_results(matches, owner)
             exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True)
             potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True)
         return exact_matches, potential_matches
@@ -242,6 +177,33 @@ class LookupCompaniesHelper:
         exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv')
         potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv')
+def get_business_details(business_id):
+    """ Get business details from the Corporation and charities filing database.
+ """ + url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}" + # Old search URL, holding onto in case the above gets blocked + # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id) + if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")): + with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: + return json.load(f) + else: + r = requests.get(url) + # Try to read the response text + try: + r_json = json.loads(r.text) + except: + r_json = {} + + try: + # TODO: Will this write an empty string if no actual request result? + with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: + str_json = json.dumps(r_json) + f.write(str_json) + except: + pass + return r_json + +# Not currently in use, needs to be updated class GroupCompaniesHelper: def __init__(self, out_path: str, out_name: str): self.output_path = out_path # The path to the output file to save the output file @@ -390,4 +352,4 @@ class GroupCompaniesHelper: results.to_csv(f"{self.output_path}/{self.output_name}") results.to_csv(f"{self.output_path}/{self.output_name}") - return results \ No newline at end of file + return results diff --git a/processors/gre-llc.py b/processors/gre-llc.py deleted file mode 100644 index 52989f0..0000000 --- a/processors/gre-llc.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3get_company_list_name_matches -# -*- coding: utf-8 -*- -""" -Created on Fri Aug 15 19:06:45 2025 - -@author: linnea - -Script to find exact and potential search results for a parcel owner in the CCFS database - -A representative example for the parcel owner (assessor) data scraping step -Address: 308 4th Ave S, Seattle, WA, 98104 -ParcelNumber: 5247801370 -ListedOwner: GRE DOWNTOWNER LLC -PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY - -We happen to already know the answer, -which is this address is part of Goodman Real Estate's extensive portfolio -GRE List: https://goodmanre.com/our-projects/ - -TODO: - - Make a flag that shows if the buywer / owner are similar - - Get the address field from CCFS, put in corp_owners - - If the previous buyer doesn't make sense, - get the year of the last buying to see if it's at all recent for sanity checks -""" - -from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper -import pandas as pd - -lookup_helper = LookupCompaniesHelper(("../data/intermediates")) - -# Option 1: Uncomment the two lines to run the full script. -# df = pd.read_csv("../data/intermediates/owners_listed.csv") -# owner_names = df["ListedOwner"].unique() - -# Option 2: Uncomment two lines to run with a specific subset for debugging -df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf') -df = df.iloc[1] -owner_names = [df["ListedOwner"]] - -exact, potential = lookup_helper.get_company_list_name_matches(owner_names) - - diff --git a/processors/test.py b/processors/test.py new file mode 100644 index 0000000..f984d80 --- /dev/null +++ b/processors/test.py @@ -0,0 +1,25 @@ +""" +Helper script for testing out changes to business lookup. +Uses the GRE data that we were able to collect by hanad for verification. 
+""" + +from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper +import pandas as pd +import os + +lookup_helper = LookupCompaniesHelper(("../data/intermediates")) + +print(os.getcwd()) +df = pd.read_excel("./experiments/gre_apartments.ods", engine='odf') + +# Option 1: iterate through the whole list of GRE apartment names +owner_names = df["ListedOwner"].unique() + +# Option 2: pick a specific owner name +# owner_names = ["GRE 4TH AVE S LLC"] + + +exact, potential = lookup_helper.get_company_list_name_matches(owner_names) + +exact.to_csv("./data/intermediates/exact.csv") +potential.to_csv("./data/intermediates/potential.csv") diff --git a/requirements.txt b/requirements.txt index 1bed897..beeeac8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,6 @@ argon2-cffi-bindings==25.1.0 cffi==2.0.0 minio==7.2.16 pycryptodome==3.23.0 +dotenv==0.9.9 +python-dotenv==1.2.1 +psycopg2-binary==2.9.10