CCFS Database #18

.gitignore (vendored)
@@ -6,4 +6,5 @@ tmp/
*.org~
data/
venv/
*~
.env
@@ -11,7 +11,6 @@ Relevant but not 1-1 walkthrough of how to programmatically find building owners

## Project Structure

- `processors/`: directory for Python scripts to extract, transform, and load the data
- `postgis-data/`: directory that will hold the PostgreSQL data
- `data/`: currently ignored by git; needs to be shared manually
  - `inputs/`: directory with only files that come directly from APIs or public websites
  - `intermediates/`: directory for folders containing intermediate transformed versions of the inputs
@@ -27,6 +26,7 @@ TODO: Find a good source for eviction filing data. Those with access can refer t

## Object Storage

### King County Assessor Data

An S3-compatible object store is hosted at [minio.radmin.live](https://minio.radmin.live).

SDK documentation: https://github.com/minio/minio-py/blob/master/docs/API.md

@@ -36,3 +36,6 @@ Use `lib/minio_helper.py` to extend the functionality

Run `test_minio` in `lib/main.py` to verify that it works (TODO: move this to its own testing script, perhaps unit tests)

Note: You will need `minio_access_key` and `minio_secret_key` in your env before running for this to work (contact @linnealovespie or @ammaratef45 to obtain these keys)
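For a quick smoke test outside `lib/minio_helper.py`, a minimal sketch using the minio SDK; the env variable names follow the note above, and it assumes the endpoint serves TLS (the SDK's default):

```python
import os
from minio import Minio

# Credentials come from the env, as described above; contact the maintainers for keys.
client = Minio(
    "minio.radmin.live",
    access_key=os.environ["minio_access_key"],
    secret_key=os.environ["minio_secret_key"],
)

# Listing buckets is a cheap way to confirm the credentials work.
for bucket in client.list_buckets():
    print(bucket.name)
```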
### CCFS Data

We have our own copy of the [CCFS database](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research). Contact @linnealovespie or @jessib to get the `.env` file needed to load in the connection string secrets.
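For orientation, a minimal sketch of how those secrets get used; it mirrors the `DB_*` variable names and the `corporations` table that `processors/corp_owners.py` reads below:

```python
import os
import psycopg2
from dotenv import load_dotenv

# Pulls DB_NAME, DB_USER, DB_PASS, DB_HOST, DB_PORT out of the shared .env file
load_dotenv()

conn = psycopg2.connect(
    database=os.environ.get("DB_NAME"),
    user=os.environ.get("DB_USER"),
    password=os.environ.get("DB_PASS"),
    host=os.environ.get("DB_HOST"),
    port=os.environ.get("DB_PORT"),
)

# A trivial sanity query against our CCFS copy
with conn.cursor() as cur:
    cur.execute('select count(*) from corporations;')
    print(cur.fetchone()[0])
```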
Binary file not shown.
processors/corp_owners.py

@@ -14,155 +14,90 @@ import json
import os
import re
import urllib.parse
import psycopg2
from dotenv import load_dotenv

search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
# Old search URL, holding onto in case the above gets blocked
# search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'

principal_headers = {
    'Accept-Language': 'en-US,en;q=0.8,es-AR;q=0.5,es;q=0.3',
    'Referer': 'https://ccfs.sos.wa.gov/',
    'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',  # this might be an issue
    'Origin': 'https://ccfs.sos.wa.gov'
}


def get_business_search_payload(business_name, page_count, page_num):
    return {
        'Type': 'BusinessName',
        'SearchEntityName': business_name,
        'SearchType': 'BusinessName',
        'SortType': 'ASC',
        'SortBy': 'Entity Name',
        'SearchValue': business_name,
        'SearchCriteria': 'Contains',
        'IsSearch': 'true',
        'PageID': page_num,
        'PageCount': page_count,
    }


def get_business_details(business_id):
    """ Get business details from the Corporation and charities filing database. """
    url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
    # Old search URL, holding onto in case the above gets blocked
    # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    if os.path.exists(f"../data/inputs/principals_json/{business_id}.json"):
        # Use the locally cached response if we already fetched this business
        with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
            return json.load(f)
    else:
        r = requests.get(url)
        # Try to read the response text
        try:
            r_json = json.loads(r.text)
        except Exception:
            r_json = {}

        try:
            # TODO: Will this write an empty string if no actual request result?
            # Note: mode must be 'w' here; the original 'r' would fail on write
            with open(f"../data/inputs/principals_json/{business_id}.json", 'w') as f:
                str_json = json.dumps(r_json)
                f.write(str_json)
        except Exception:
            pass
        return r_json


load_dotenv()

DB_NAME = os.environ.get("DB_NAME")
DB_USER = os.environ.get("DB_USER")
DB_PASS = os.environ.get("DB_PASS")
DB_HOST = os.environ.get("DB_HOST")
DB_PORT = os.environ.get("DB_PORT")


class LookupCompaniesHelper:
    def __init__(self, out_path: str):
        self.output_path = out_path  # Absolute path to where the file will be saved

    def _get_empty_df(self):
        return pd.DataFrame([], columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId',
                                         'Address', 'Status', 'address_match'])
        return pd.DataFrame([], columns=['SearchTerm', 'BusinessName', 'address_match'])

    def _get_business_search_results(self, business_name_orig, page_num):
    def _query_db(self, business_name):
        conn = psycopg2.connect(database=DB_NAME,
                                user=DB_USER,
                                password=DB_PASS,
                                host=DB_HOST,
                                port=DB_PORT)
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute("""
                select
                    corporations."Ubi",
                    corporations."BusinessName",
                    corporations."Type",
                    corporations."TypeDescription",
                    corporations."RecordStatus",
                    business_info."MailingAddressLine1",
                    business_info."MailingCity",
                    business_info."MailingState",
                    business_info."MailingCountry",
                    business_info."MailingZip5"
                from business_info
                inner join corporations
                    on business_info."Ubi" = corporations."Ubi"
                where corporations."BusinessName" ~ %s
                limit 10
                ;
                """, (business_name + "*",))
            rows = cur.fetchall()

        row_names = ["Ubi", "BusinessName", "Type", "TypeDescription", "RecordStatus",
                     "MailingAddressLine1", "MailingCity", "MailingState", "MailingCountry", "MailingZip5"]
        table = []
        for r in rows:
            table += [dict(zip(row_names, r))]
        return table
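# Reviewer sketch (not part of the diff): in Postgres, `~` is a POSIX regex match,
# so business_name + "*" makes the trailing "*" mean "previous character, zero or
# more times" rather than a wildcard. Python's regex behaves the same way here:
#
#     import re
#     bool(re.search("GRE*", "GR HOLDINGS"))  # True: "GR" plus zero E's matches
#
# If a prefix match is what's intended, an anchored pattern ('^' + name) or a
# LIKE query may be closer to the goal; flagging this as an assumption about intent.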
    def _get_business_search_results(self, business_name_orig):
        business_name = business_name_orig.strip()
        no_result = True
        result = {}
        while no_result and len(business_name) > 0:
            print(f"searching with name {business_name}")
            r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
            # TODO: add some more error handling in case of connectivity issues.
            if r.status_code == 429:
                # TODO: Raise an error
                print("This IP address has likely been blocked by CCFS, try using a vpn")
            result = json.loads(r.text)
            if len(result) > 0:
                result = self._query_db(business_name)

            # If no search results, try removing the last word in the name
            # This seems to be a decent heuristic because final words are things like LTD, APTS
            # TODO: A more robust search function could make this irrelevant
            if len(result) > 0:
                no_result = False
            else:
                # Strip off the last word from the search term and try again next iteration
                try:
                    # Get the index of the last space in the name
                    last_space = business_name[::-1].index(" ")
                    business_name = business_name[: -1 - last_space].strip()
                except ValueError:
                    # TODO: In this case, try with the LastBuyer instead of ListedOwner?
                    print(f"Found no business with name {business_name_orig}\n")
                    business_name = ""

        return result
        # Get the index of the last space in the name
        last_space = business_name[::-1].index(" ")
        business_name = business_name[: -1 - last_space].strip()

    def _extract_search_results(self, search_term, search_req_response):
        res_list = []
        for res in search_req_response:
            # build up the known responses
            # get more business data from that id
            business_info = get_business_details(res["BusinessID"])
            res_list += [[search_term.strip(),
                          res.get('BusinessName').strip(),
                          res.get('UBINumber'),
                          res.get('BusinessID'),
                          res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'],
                          res.get("BusinessStatus"),
                          business_info.get("BINAICSCodeDesc", "NOT_FOUND")]]
        # return an empty row if no search results
        if len(search_req_response) == 0:
            res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]]

        res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"])

        # Clean some of the results a bit more:
        # Keep only active companies and searches that yielded no results
        res_df = res_df[(res_df["Status"] == "Active") | (res_df["Status"] == "NOT_FOUND")]

        # TODO: Maybe add a filter on BusinessNature for only real estate / property investments
        # TODO: First need to get an idea of all the BusinessNature types

        # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers
        # This check is a very simple heuristic and more robust matching will occur later in processing
        exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
        if exact_match:
            res_df = pd.concat([res_df.iloc[[exact_match[0]], :], res_df.drop(exact_match[0], axis=0)], axis=0)

        return res_df

    def _determine_search_matches(self, search_results_df):
        """
        Mark row as potential match: UBI number is a duplicate, or Address is the same.
        df.duplicated just sees if that address is already in the dataframe, NOT that the search term
        and result have the same address. Could add search terms as a subset for the duplicated call.
        """
        search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False)
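# Reviewer sketch (not part of the diff): what duplicated(subset=['Address'], keep=False)
# does, given the caveat in the docstring above. Any rows sharing an address are all
# flagged, no matter which search term produced them:
#
#     import pandas as pd
#     demo = pd.DataFrame({"SearchTerm": ["GRE DOWNTOWNER LLC", "OTHER LLC"],
#                          "Address": ["308 4TH AVE S", "308 4TH AVE S"]})
#     demo.duplicated(subset=["Address"], keep=False).tolist()  # [True, True]
#
# Passing subset=["SearchTerm", "Address"] would instead require the same search term
# and address to repeat before a row is flagged.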
    def _get_all_company_name_match_search_results(self, owner_name):
        n = 1
        res_length = 100
        search_results = []

        res = self._get_business_search_results(owner_name, n)
        return res
        df = pd.DataFrame(result)
        df["SearchTerm"] = business_name_orig
        return df

    """
    """
    def _get_potential_company_name_matches(self, owner_name):
        all_search_results = self._get_all_company_name_match_search_results(owner_name)
        extracted_results = self._extract_search_results(owner_name, all_search_results)
        self._determine_search_matches(extracted_results)
        return extracted_results
    def _get_potential_company_name_matches(self, owner_name):
        all_search_results = self._get_business_search_results(owner_name)
        df = pd.DataFrame(all_search_results)
        df["SearchTerm"] = owner_name
        return df

    def _separate_search_results(self, results):
    def _separate_search_results(self, results, searchTerm):
        """
        utils to separate search results into exact match, potential match (where no exact match was found),
        and additional matches (extra matches if there was an exact match and additional matches)
@@ -171,9 +106,9 @@ class LookupCompaniesHelper:
        - Partnership
        - etc.
        """
        def is_exact_match(row):
        def is_exact_match(row, searchTerm):
            """ Extract exact matches, including some regex magic. """
            search = row["SearchTerm"]
            search = searchTerm
            result = row["BusinessName"]

            # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
@@ -201,7 +136,7 @@ class LookupCompaniesHelper:
        exact_matches = self._get_empty_df()
        potential_matches = self._get_empty_df()

        exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
        exact_match = results[results.apply(lambda row: is_exact_match(row, searchTerm), axis=1)]
        # TODO: If going to do len(results) check, then need to filter by business nature sooner
        # Len results heuristic doesn't work for empty searches, or the recursive search
        if len(exact_match) > 0:  # or len(results) == 1:
@@ -224,8 +159,8 @@ class LookupCompaniesHelper:

        for owner in owner_list:
            owner = owner.strip()  # Clean owner name slightly
            matches = self._get_potential_company_name_matches(owner)
            temp_exact, temp_potential = self._separate_search_results(matches)
            matches = self._get_business_search_results(owner)
            temp_exact, temp_potential = self._separate_search_results(matches, owner)
            exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True)
            potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True)
        return exact_matches, potential_matches
@@ -242,6 +177,33 @@ class LookupCompaniesHelper:
        exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv')
        potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv')


def get_business_details(business_id):
    """ Get business details from the Corporation and charities filing database. """
    url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
    # Old search URL, holding onto in case the above gets blocked
    # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
    if os.path.exists(f"../data/inputs/principals_json/{business_id}.json"):
        # Use the locally cached response if we already fetched this business
        with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
            return json.load(f)
    else:
        r = requests.get(url)
        # Try to read the response text
        try:
            r_json = json.loads(r.text)
        except Exception:
            r_json = {}

        try:
            # TODO: Will this write an empty string if no actual request result?
            # Note: mode must be 'w' here; the original 'r' would fail on write
            with open(f"../data/inputs/principals_json/{business_id}.json", 'w') as f:
                str_json = json.dumps(r_json)
                f.write(str_json)
        except Exception:
            pass
        return r_json


# Not currently in use, needs to be updated
class GroupCompaniesHelper:
    def __init__(self, out_path: str, out_name: str):
        self.output_path = out_path  # The path where the output file will be saved
@@ -390,4 +352,4 @@ class GroupCompaniesHelper:
        results.to_csv(f"{self.output_path}/{self.output_name}")

        results.to_csv(f"{self.output_path}/{self.output_name}")
        return results
        return results
@@ -1,43 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 15 19:06:45 2025

@author: linnea

Script to find exact and potential search results for a parcel owner in the CCFS database

A representative example for the parcel owner (assessor) data scraping step
Address: 308 4th Ave S, Seattle, WA, 98104
ParcelNumber: 5247801370
ListedOwner: GRE DOWNTOWNER LLC
PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY

We happen to already know the answer,
which is that this address is part of Goodman Real Estate's extensive portfolio
GRE List: https://goodmanre.com/our-projects/

TODO:
- Make a flag that shows if the buyer / owner are similar
- Get the address field from CCFS, put in corp_owners
- If the previous buyer doesn't make sense,
  get the year of the last purchase to see if it's at all recent, as a sanity check
"""

from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
import pandas as pd

lookup_helper = LookupCompaniesHelper("../data/intermediates")

# Option 1: Uncomment the two lines to run the full script.
# df = pd.read_csv("../data/intermediates/owners_listed.csv")
# owner_names = df["ListedOwner"].unique()

# Option 2: Uncomment these lines to run with a specific subset for debugging
df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf')
df = df.iloc[1]
owner_names = [df["ListedOwner"]]

exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
processors/test.py (new file)

@@ -0,0 +1,25 @@
"""
|
||||
Helper script for testing out changes to business lookup.
|
||||
Uses the GRE data that we were able to collect by hanad for verification.
|
||||
"""
|
||||
|
||||
from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
lookup_helper = LookupCompaniesHelper(("../data/intermediates"))
|
||||
|
||||
print(os.getcwd())
|
||||
df = pd.read_excel("./experiments/gre_apartments.ods", engine='odf')
|
||||
|
||||
# Option 1: iterate through the whole list of GRE apartment names
|
||||
owner_names = df["ListedOwner"].unique()
|
||||
|
||||
# Option 2: pick a specific owner name
|
||||
# owner_names = ["GRE 4TH AVE S LLC"]
|
||||
|
||||
|
||||
exact, potential = lookup_helper.get_company_list_name_matches(owner_names)
|
||||
|
||||
exact.to_csv("./data/intermediates/exact.csv")
|
||||
potential.to_csv("./data/intermediates/potential.csv")
|
||||
@@ -30,3 +30,6 @@ argon2-cffi-bindings==25.1.0
cffi==2.0.0
minio==7.2.16
pycryptodome==3.23.0
dotenv==0.9.9
python-dotenv==1.2.1
psycopg2-binary==2.9.10

linnealovespie marked this conversation as resolved
> Should `psycopg2` be added here? I needed to install it manually, but maybe that was something with my setup.

Yes it should, good catch. I'd like to add another TODO later to automate this process, because slip-ups like this happen, but my `pip freeze` command looks funky when trying to generate this file.