diff --git a/README.md b/README.md
index 3ed9f13..0e3373a 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # aemp-seattle
 
-Initial repository for building up Seattle database for anti-eviction mapping. 
+Initial repository for building up a Seattle database for anti-eviction mapping.
 
-Modelled off the [evictorbase pipeline code](https://github.com/antievictionmappingproject/eb-data-pipeline). 
+Modelled off the [evictorbase pipeline code](https://github.com/antievictionmappingproject/eb-data-pipeline).
 
-Relevant but not 1-1 walkthrough of how to programmatically find building owners: [350 Seattle BEPS Repo](https://github.com/BenBagBag/350_seattle_building_ownership/blob/main/How%20to%20Find%20Building%20Owners.ipynb). 
+A relevant, though not 1-1, walkthrough of how to programmatically find building owners: [350 Seattle BEPS Repo](https://github.com/BenBagBag/350_seattle_building_ownership/blob/main/How%20to%20Find%20Building%20Owners.ipynb).
 
 [AEMP Seattle Community Agreements](https://docs.google.com/document/d/1ZMeRmPWmhxynBXZ-aV6R2sQBktjNYRL9Xw9PHpkVpJE/edit?usp=drive_link)
@@ -18,11 +18,12 @@ Relevant but not 1-1 walkthrough of how to programmatically find building owners
 - `to_load/`: directory for files that can be loaded directly into the PostgreSQL database
 - `experiments/`: directory for Jupyter notebooks for data exploration and script development
 
-## Data Inputs: 
-[eRealProperty](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): King County assessor data for finding the owner of a given parcel. 
-[Washington State Corporations and Charities Filing Database (CCFS)](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): For looking up a parcel owner name and finding the related business listing and related info. 
-
-TODO: Find a good source for eviction filing data. Those with access can refer to the [potential data source list](https://docs.google.com/spreadsheets/d/1Ew0UrZvP-S74velkWSKaiSGBYcxIAoRH6IGpEzNWX6s/edit?gid=0#gid=0) to find new data sources. 
+## Data Inputs:
+[King County Assessor](https://info.kingcounty.gov/assessor/DataDownload/default.aspx): Download records of all apartment complexes in King County.
+[eRealProperty](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): King County assessor data for finding the owner of a given parcel.
+[Washington State Corporations and Charities Filing System (CCFS)](https://ccfs.sos.wa.gov/): For looking up a parcel owner name and finding the related business listing and info.
+
+TODO: Find a good source for eviction filing data. Those with access can refer to the [potential data source list](https://docs.google.com/spreadsheets/d/1Ew0UrZvP-S74velkWSKaiSGBYcxIAoRH6IGpEzNWX6s/edit?gid=0#gid=0) to find new data sources.
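+
+A minimal sketch of how these inputs flow through the helpers in `processors/`
+(paths and the sample owner name are illustrative; run from inside `processors/`):
+
+```python
+import os
+import pandas as pd
+from parcel_owners import ParcelLookupHelper
+from corp_owners import LookupCompaniesHelper
+
+# 1. Apartment parcels from the King County Assessor download
+df = pd.read_csv("EXTR_AptComplex.csv")
+
+# 2. Parcel owners scraped from eRealProperty
+parcel_helper = ParcelLookupHelper(os.getcwd(), True)
+parcel_helper.scrape_parcel_owners(df["ParcelNumber"], "raw/owners_0", False)
+
+# 3. Owner names matched against CCFS business filings
+lookup_helper = LookupCompaniesHelper("../data/intermediates")
+exact, potential = lookup_helper.get_company_list_name_matches(["GRE DOWNTOWNER LLC"])
+```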
 
 ## Object Storage
@@ -34,4 +35,4 @@ Use `lib/minio_helper.py` to extend the functionality
 
 Run `test_minio` in `lib/main.py` to test out that it works (TODO: move this to own testing script, perhaps unit tests)
 
-Note: You will need to have access_key and secret_key in your env before running for this to work, contact @linnealovespie or @ammaratef45 to obtain these keys)
+Note: You will need to have minio_access_key and minio_secret_key in your env before running for this to work (contact @linnealovespie or @ammaratef45 to obtain these keys).
diff --git a/experiments/aemp.org b/experiments/aemp.org
index 674a7c8..76b826f 100755
--- a/experiments/aemp.org
+++ b/experiments/aemp.org
@@ -85,3 +85,10 @@ Can then determine which one to use
 ** TODO: Do some data cleaning to have names be the same
 eg. Seattle city of is #1 and #4 most common property owner names, should be standardized
 eg. LLC, LLP, L.L.C. etc. all to one format
+
+* 28 August
+From Dox: 610 Harvard Ave East
+Seattle, WA 98102
+Intense management malpractice; would like to know more about the above address.
+** TODO: At some point, cross-reference with registered rental data
+https://data.seattle.gov/Built-Environment/Rental-Property-Registration/j2xh-c7vt/about_data
diff --git a/experiments/gre_apartments.ods b/experiments/gre_apartments.ods
index 4f79295..e3c4516 100644
Binary files a/experiments/gre_apartments.ods and b/experiments/gre_apartments.ods differ
diff --git a/lib/minio_helper.py b/lib/minio_helper.py
index ba6392b..8d3c447 100644
--- a/lib/minio_helper.py
+++ b/lib/minio_helper.py
@@ -7,8 +7,8 @@ class MinioHelper:
     def __init__(self, bucket_name: str):
         self.client = Minio(
             "minio.radmin.live",
-            access_key=os.environ['access_key'],
-            secret_key=os.environ['secret_key']
+            access_key=os.environ['minio_access_key'],
+            secret_key=os.environ['minio_secret_key']
         )
         self.bucket_name = bucket_name
diff --git a/processors/corp_owners.py b/processors/corp_owners.py
index 07e300c..56f40b8 100644
--- a/processors/corp_owners.py
+++ b/processors/corp_owners.py
@@ -1,5 +1,10 @@
 """
-    Utility functions for extracting owners.
+    Utility functions for
+    1. LookupCompaniesHelper: looking up a parcel owner in the WA Corporations and Charities database,
+       and extracting the best search result.
+    2. GroupCompaniesHelper (WIP): given a company's stated governors and addresses,
+       groups together addresses that likely share the same landlord.
+
 """
 import pandas as pd
@@ -8,13 +13,10 @@ import requests
 import json
 import os
 import re
-# import geopandas as gp
 import urllib.parse
 
-# Utils for finding principals
-
-
 search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
+# Old search URL, holding onto it in case the above gets blocked
 # search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
 
 principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
@@ -40,11 +42,30 @@ def get_business_search_payload(business_name, page_count, page_num):
     }
 
 def get_business_details(business_id):
-    """ Get business details from the Corporation and charities filing database. """
+    """ Get business details from the Corporations and Charities Filing database.
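+
+    Results are cached as JSON under ../data/inputs/principals_json/ so that
+    repeat lookups don't re-hit the CCFS API.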
+ """ url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}" + # Old search URL, holding onto in case the above gets blocked # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id) - r = requests.get(url) - return json.loads(r.text) + if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")): + with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: + return json.load(f) + else: + r = requests.get(url) + # Try to read the response text + try: + r_json = json.loads(r.text) + except: + r_json = {} + + try: + # TODO: Will this write an empty string if no actual request result? + with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f: + str_json = json.dumps(r_json) + f.write(str_json) + except: + pass + return r_json class LookupCompaniesHelper: @@ -53,26 +74,68 @@ class LookupCompaniesHelper: def _get_empty_df(self): return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', - 'Address', 'Status', 'address_match', 'ubi_match', 'id_match']) + 'Address', 'Status', 'address_match']) - def _get_business_search_results(self, business_name, page_num): - r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num)) - try: + def _get_business_search_results(self, business_name_orig, page_num): + business_name = business_name_orig.strip() + no_result = True + result = {} + while no_result and len(business_name) > 0: + print(f"searching with name {business_name}") + r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num)) + # TODO: add some more error handling in case of connectivity issues. + if r.status_code == 429: + # TODO: Raise an error + print("This IP address has likely been blocked by CCFS, try using a vpn") result = json.loads(r.text) - #return json.loads(r.text) - except: - result = {} + if len(result) > 0: + no_result = False + else: + # Strip off the last word from the search term and try again next iteration + try: + # Get the index of the last space in the name + last_space = business_name[::-1].index(" ") + business_name = business_name[: -1 - last_space].strip() + except ValueError: + # TODO: In this case, try with the LastBuyer in stead of ListedOwner? 
+ print(f"Found no business with name {business_name_orig}\n") + business_name = "" + + return result def _extract_search_results(self, search_term, search_req_response): - res_list = [[search_term, res['BusinessName'], res['UBINumber'], res['BusinessID'], - res['PrincipalOffice']['PrincipalStreetAddress']['FullAddress'], res["BusinessStatus"]] - for res in search_req_response] - res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status"]) - # Basically keep a list of exact matches, and build a list of potential matches that we give to human verifiers + res_list = [] + for res in search_req_response: + # build up the known responses + # get more business data from that id + business_info = get_business_details(res["BusinessID"]) + res_list += [[search_term.strip(), + res.get('BusinessName').strip(), + res.get('UBINumber'), + res.get('BusinessID'), + res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'], + res.get("BusinessStatus"), + business_info.get("BINAICSCodeDesc", "NOT_FOUND")]] + # return an empty row if no search results + if len(search_req_response) == 0: + res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]] + + res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"]) + + # Clean some of the results a bit more: + # Keep only active companies and searches that yielded no results + res_df = res_df[(res_df["Status"]=="Active") | (res_df["Status"]=="NOT_FOUND")] + + # TODO: Maybe add a filter on BusinessNature for only real estate/ property investments + # TODO: First need to get an idea of all the BusinessNature types + + # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers + # This check is very simple heuristic and more robust matching will occur later in processing exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist() if exact_match: res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0) + return res_df def _determine_search_matches(self, search_results_df): @@ -82,24 +145,16 @@ class LookupCompaniesHelper: and result have the same address. Could add search terms as a subset for duplicated call """ search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False) - search_results_df['ubi_match'] = search_results_df.duplicated(subset=['UBINumber'], keep=False) - search_results_df['id_match'] = search_results_df.duplicated(subset=['BusinessId'], keep=False) def _get_all_company_name_match_search_results(self, owner_name): n = 1 res_length = 100 search_results = [] - while res_length == 100: - res = self._get_business_search_results(owner_name, n) - search_results += (res) - n += 1 - res_length = len(res) - - return search_results + res = self._get_business_search_results(owner_name, n) + return res """ - TODO: Remove the ubi and address match, this does nothing to help """ def _get_potential_company_name_matches(self, owner_name): all_search_results = self._get_all_company_name_match_search_results(owner_name) @@ -111,40 +166,50 @@ class LookupCompaniesHelper: """ utils to separate search results into exact match, potential match (where no exact match was found), and additional matches (extra matches if there was an exact match and additional matches) + TODO: Give more robust answers here! 
+        Other abbreviations include:
+        - Apartment: APTS -> Apartments
+        - Partnership
+        - etc.
         """
         def is_exact_match(row):
             """ Extract exact matches, including some regex magic. """
             search = row["SearchTerm"]
             result = row["BusinessName"]
-
+
             # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
             # Limited Partnership, Limited liability company
-            p = re.compile("L[\s.]?L[\s,.]?[PC][.]" ,flags=re.IGNORECASE)
-            result=result.replace(",", "")
+            # The trailing dot is optional so that plain "LLC"/"LLP" also match.
+            p = re.compile(r"L[\s.]?L[\s,.]?[PC][.]?", flags=re.IGNORECASE)
+
+            replace_map = {
+                ",": "",
+                "LIMITED LIABILITY COMPANY": "LLC",
+                "LIMITED PARTNERSHIP": "LLC",
+                "APARTMENTS": "APTS",
+                "LTD PS": "LLC",
+                "LTD PARTNERSHIP": "LLC",
+            }
+
             result= re.sub(p, "LLC", result)
-            result=result.replace("LIMITED LIABILITY COMPANY", "LLC")
-            result=result.replace("LIMITED PARTNERSHIP", "LLC")
-
-            search=search.replace(",", "")
             search=re.sub(p, "LLC", search)
-            search=search.replace("LIMITED PARTNERSHIP", "LLC")
-            search=search.replace("LIMITED LIABILITY COMPANY", "LLC")
+
+            for k, v in replace_map.items():
+                result = result.replace(k, v)
+                search = search.replace(k, v)
 
             return search == result
 
         exact_matches = self._get_empty_df()
-        exact_matches.columns
         potential_matches = self._get_empty_df()
-        additional_matches = self._get_empty_df()
 
         exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
-        if len(exact_match) > 0:
-            exact_matches = pd.concat([exact_matches, exact_match], ignore_index=True)
-            additional_matches = pd.concat([additional_matches, results[results['SearchTerm'] != results['BusinessName']]], ignore_index=True)
+        # TODO: If going to do a len(results) check, then need to filter by business nature sooner;
+        # the len(results) heuristic doesn't work for empty searches or the recursive search
+        if len(exact_match) > 0: # or len(results) == 1:
+            # Keep the first (normalized) exact match
+            exact_matches = exact_match.iloc[[0]]
         else:
             potential_matches = pd.concat([potential_matches, results], ignore_index=True)
 
-        return exact_matches, potential_matches, additional_matches
+        return exact_matches, potential_matches
 
     def get_company_list_name_matches(self, owner_list: list):
         """
@@ -152,19 +217,18 @@ def get_company_list_name_matches(self, owner_list: list):
         owner_list: a list of owner names that will be searched in the CCFS database for matches.
         Exact_matches: when the search term exactly matches a result in the CCFS database.
         Potential_matches: when the search term doesn't exactly match; these need some human verification to determine.
-        Additional_matches: extraneous matches in case potential_matches didn't yield enough results.
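+        Returns two DataFrames: (exact_matches, potential_matches).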
""" exact_matches = self._get_empty_df() potential_matches = self._get_empty_df() - additional_matches = self._get_empty_df() + # TODO: Make a df for search terms with no matches and how to make it mesh well with recursive search for owner in owner_list: + owner = owner.strip() # Clean owner name slightly matches = self._get_potential_company_name_matches(owner) - temp_exact, temp_potential, temp_add = self._separate_search_results(matches) + temp_exact, temp_potential = self._separate_search_results(matches) exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True) potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True) - additional_matches = pd.concat([temp_add, additional_matches], ignore_index=True) - return exact_matches, potential_matches, additional_matches + return exact_matches, potential_matches def get_company_matches_and_export(self, owner_list: list, x: int): @@ -173,11 +237,10 @@ class LookupCompaniesHelper: match CSV's in the folder determined by `output_path` """ print(f"Saving output files to {self.output_path}") - exact_matches, potential_matches, additional_matches = self.get_company_list_name_matches(owner_list) + exact_matches, potential_matches = self.get_company_list_name_matches(owner_list) exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv') potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv') - additional_matches.to_csv(f'{self.output_path}/additional_matches_{x}.csv') class GroupCompaniesHelper: def __init__(self, out_path: str, out_name: str): @@ -261,7 +324,7 @@ class GroupCompaniesHelper: return principals business_ids = [res['BusinessID'] for res in search_results] business_names = [res['BusinessName'] for res in search_results] - ubi_nums = [res['UBINumber'] for res in search_results] + # ubi_nums = [res['UBINumber'] for res in search_results] for id, name in zip(business_ids, business_names): business_json = get_business_details(id) diff --git a/processors/gre-llc.py b/processors/gre-llc.py index 509c4d2..52989f0 100644 --- a/processors/gre-llc.py +++ b/processors/gre-llc.py @@ -5,44 +5,39 @@ Created on Fri Aug 15 19:06:45 2025 @author: linnea +Script to find exact and potential search results for a parcel owner in the CCFS database + +A representative example for the parcel owner (assessor) data scraping step Address: 308 4th Ave S, Seattle, WA, 98104 ParcelNumber: 5247801370 ListedOwner: GRE DOWNTOWNER LLC PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY - +We happen to already know the answer, +which is this address is part of Goodman Real Estate's extensive portfolio GRE List: https://goodmanre.com/our-projects/ TODO: - Make a flag that shows if the buywer / owner are similar - - Check the fuzzy wuzzy matching in utils - Get the address field from CCFS, put in corp_owners - If the previous buyer doesn't make sense, get the year of the last buying to see if it's at all recent for sanity checks - -1. Load in the whole dataframe of owners and buyers -2. Get the whole list of responses for the listed owner - - This shows all the companies that match the listed owner in assessor data - - Need to find the most likely company in CCFS to match the listed owner -3. Make a df out of? -4. 
""" from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper import pandas as pd lookup_helper = LookupCompaniesHelper(("../data/intermediates")) -df = pd.read_csv("../data/intermediates/owners_listed.csv") -# Almost never need additional matches, as it's only populated if there's an exact match -exact, potential, additional = lookup_helper.get_company_list_name_matches(["GRE DOWNTOWNER LLC"]) -owner_names = df["ListedOwner"].unique() -# exact, potential, additional = lookup_helper.get_company_list_name_matches(owner_names[:10]) +# Option 1: Uncomment the two lines to run the full script. +# df = pd.read_csv("../data/intermediates/owners_listed.csv") +# owner_names = df["ListedOwner"].unique() + +# Option 2: Uncomment two lines to run with a specific subset for debugging +df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf') +df = df.iloc[1] +owner_names = [df["ListedOwner"]] + +exact, potential = lookup_helper.get_company_list_name_matches(owner_names) -if(len(exact) >= 1): - ubi = exact.loc[0, "UBINumber"] - -group_helper= GroupCompaniesHelper("../data/intermediates", "principals") -# TODO: Figure out how to format the url for proper response -res_group = group_helper.get_companies_principals(exact) diff --git a/processors/merge.py b/processors/merge.py index 50302e0..ef619a0 100644 --- a/processors/merge.py +++ b/processors/merge.py @@ -2,9 +2,12 @@ # -*- coding: utf-8 -*- """ Created on Tue Aug 12 18:17:47 2025 - @author: linnea +One-time script for cleaning up parcel lookup data. +If everything went 100% smoothly in scrape.py (ie. no search results came back empty), +then this script shouldn't be needed. + 1. Load intermediate results 2. Load original data with unmodified parcelid 3. Add a taxparcelid to unmodified so can merge @@ -66,9 +69,7 @@ if __name__ == "__main__": # Add address from df_apts to df_raw df_join = df_apts.merge(df_raw, 'left', on="ParcelNumber") df_join["ListedOwner"] = "NOT_FOUND" - # df_join["ListedOwner"] = df_join.apply(lambda row: get_listed_owner(row), axis=1) for idx, row in df_join.iterrows(): - # df_join.loc[idx, "ListedOwner"] = get_listed_owner(row) row.ListedOwner = get_listed_owner(row) df_join.loc[idx] = row if idx % 500 == 0: @@ -76,7 +77,6 @@ if __name__ == "__main__": df_join.to_csv(f"{intermediates_path}/owners_listed.csv") df_join.to_csv(f"{intermediates_path}/owners_listed.csv") - # df_join = df_join.rename(columns={"Owner":"RecentBuyer"}) diff --git a/processors/parcel_owners.py b/processors/parcel_owners.py index e1663e5..a82d1b5 100644 --- a/processors/parcel_owners.py +++ b/processors/parcel_owners.py @@ -1,3 +1,8 @@ +""" + Utils for finding a parcel owner given an address. + Data source is King County Assessor. +""" + import pandas as pd from bs4 import BeautifulSoup import requests @@ -43,13 +48,7 @@ class ParcelLookupHelper: if data_not_found: return None return html_soup - - # TODO: Maybe include sales history AND current owner? - # Example: 308 4TH AVE S 98104 - # Current owner = GRE DOWNTOWNER LLC - # Sales history = CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY - # Website also shows GRE DOWNTOWNER https://www.addisononfourth.com/ - # TODO: cache the whole soup object so can lookup later? + s def _get_owner_name_from_soup(self, soup: object): """ Extract the owner name from a given BeautifulSoup object, `soup`, of a Property Detail page. 
@@ -60,34 +59,8 @@ class ParcelLookupHelper:
         parent = title.parent
         next_tr = title and parent.find_next('tr')
         table = next_tr and next_tr.table
         return table and table.find_all('td')[5].text
-
-    def _get_num_units_and_types_from_soup(self, soup: object):
-        """
-        Given a BeautifulSoup object, `soup`, of a Property Detail page, extract:
-        - the number of units in the building
-        - the unit types
-        - the sq ft of each unit type
-        - number of bed/bath rooms in each unit type
-        """
-        title = soup.find('span', text = 'Unit Breakdown')
-        if not title:
-            return { 'numUnits': 'NOT_FOUND', 'unitDetails': 'NOT_FOUND' }
-
-        table = title and title.find_next('div').table
-        table_rows = table and table.find_all('tr')[1:]
-        cells = table_rows and [row.find_all('td') for row in table_rows]
-        table_data = []
-
-        for c in cells:
-            table_data.append([span.text for span in c])
-        total_units = sum([int(row[1]) for row in table_data])
-        dict_keys = ['type', 'number', 'sqft', 'bed', 'bath']
-        units = [dict(zip(dict_keys, row)) for row in table_data]
-        return { 'numUnits': total_units, 'unitDetails': units }
-
-
-    # TODO: pass maybe a list of features want to extract?
 
     def _scrape_parcel_owners(self, tax_parcel_id_numbers: list, file_name: str):
@@ -110,6 +83,10 @@ class ParcelLookupHelper:
         self._write_parcel_owner_csv(parcel_df, file_name)
 
     def _save_html(self, soup, id):
+        """
+        Given a 'soup' type response for an address lookup, save it
+        as an html file for future lookups.
+        """
         table = soup.find("table", attrs={"class":"_table2", "id":"TABLE1"})
         with open(f"{self.output_path}/html/{id}.html", 'w') as f:
             f.write(str(table))
@@ -122,7 +99,6 @@ class ParcelLookupHelper:
             return "NOT FOUND"
         else:
             return self._get_owner_name_from_soup(parcel_soup)
-        # parcel_df.loc[len(parcel_df.index)] = [id, owner_name]
 
 
     def _scrape_parcel_owners_and_unit_details(self, tax_parcel_id_numbers: list, file_name: str):
diff --git a/processors/requirements-conda.txt b/processors/requirements-conda.txt
new file mode 100644
index 0000000..d0b6b33
--- /dev/null
+++ b/processors/requirements-conda.txt
@@ -0,0 +1,69 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+# created-by: conda 25.5.1
+_libgcc_mutex=0.1=main
+_openmp_mutex=5.1=1_gnu
+beautifulsoup4=4.13.5=py313h06a4308_0
+blas=1.0=mkl
+bottleneck=1.4.2=py313hf0014fa_0
+brotlicffi=1.0.9.2=py313h6a678d5_1
+bs4=4.13.5=py39hd3eb1b0_0
+bzip2=1.0.8=h5eee18b_6
+ca-certificates=2025.9.9=h06a4308_0
+certifi=2025.8.3=py313h06a4308_0
+cffi=2.0.0=pypi_0
+charset-normalizer=3.3.2=pyhd3eb1b0_0
+defusedxml=0.7.1=pyhd3eb1b0_0
+expat=2.7.1=h6a678d5_0
+idna=3.7=py313h06a4308_0
+intel-openmp=2025.0.0=h06a4308_1171
+ld_impl_linux-64=2.40=h12ee557_0
+libffi=3.4.4=h6a678d5_1
+libgcc-ng=11.2.0=h1234567_1
+libgomp=11.2.0=h1234567_1
+libmpdec=4.0.0=h5eee18b_0
+libstdcxx-ng=11.2.0=h1234567_1
+libuuid=1.41.5=h5eee18b_0
+libxcb=1.17.0=h9b100fa_0
+libzlib=1.3.1=hb25bd0a_0
+mkl=2025.0.0=hacee8c2_941
+mkl-service=2.4.0=py313h5eee18b_3
+mkl_fft=1.3.11=py313hacdc0fc_1
+mkl_random=1.2.8=py313h8928b4f_1
+ncurses=6.5=h7934f7d_0
+numexpr=2.11.0=py313h41d4191_1
+numpy=2.3.3=py313h720eef7_0
+numpy-base=2.3.3=py313h95072fd_0
+odfpy=1.4.1=pyhd8ed1ab_1
+openssl=3.0.17=h5eee18b_0
+pandas=2.3.2=py313h280b501_0
+pip=25.2=pyhc872135_0
+pthread-stubs=0.3=h0ce48e5_1
+pycparser=2.23=py313h06a4308_0
+pysocks=1.7.1=py313h06a4308_0
+python=3.13.7=h7e8bc2b_100_cp313
+python-dateutil=2.9.0post0=py313h06a4308_2
+python-tzdata=2025.2=pyhd3eb1b0_0
+python_abi=3.13=1_cp313
+pytz=2025.2=py313h06a4308_0
+readline=8.3=hc2a1206_0
+requests=2.32.5=py313h06a4308_0
+setuptools=72.1.0=py313h06a4308_0
+six=1.17.0=py313h06a4308_0
+soupsieve=2.5=py313h06a4308_0
+sqlite=3.50.2=hb25bd0a_1
+tbb=2022.0.0=hdb19cb5_0
+tbb-devel=2022.0.0=hdb19cb5_0
+tk=8.6.15=h54e0aa7_0
+typing-extensions=4.15.0=py313h06a4308_0
+typing_extensions=4.15.0=py313h06a4308_0
+tzdata=2025b=h04d1e81_0
+urllib3=2.5.0=py313h06a4308_0
+wheel=0.45.1=py313h06a4308_0
+xorg-libx11=1.8.12=h9b100fa_1
+xorg-libxau=1.0.12=h9b100fa_0
+xorg-libxdmcp=1.1.5=h9b100fa_0
+xorg-xorgproto=2024.1=h5eee18b_1
+xz=5.6.4=h5eee18b_1
+zlib=1.3.1=hb25bd0a_0
diff --git a/processors/scrape.py b/processors/scrape.py
index 8ed090b..cd8e2e6 100644
--- a/processors/scrape.py
+++ b/processors/scrape.py
@@ -1,13 +1,15 @@
+"""
+Script for getting all apartment addresses in King County,
+and looking up their parcel owners in the KC Assessor.
+Only runs in set increments to avoid being blocked by the Assessor site.
+"""
 import pandas as pd
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-import time
 import os
 
 from parcel_owners import ParcelLookupHelper
 
-incr = 998
+incr = 998 # Number of addresses to look up per run.
 
-class ParcelScraper:
+class ApartmentDataLoader:
     def __init__(self, path):
         self.path = path # path to the csv
         # self.driver = self.load_driver()
@@ -35,31 +37,6 @@ class ParcelScraper:
         if idx % 50 == 0:
             print(f"Saving row {idx}")
             self.df.to_csv("apartments_with_owners.csv")
-
-    def submit_parcel(self, parcel):
-        self.driver.get("https://blue.kingcounty.com/Assessor/eRealProperty/default.aspx")
-        print(f"https://blue.kingcounty.com/Assessor/eRealProperty/Dashboard.aspx?ParcelNbr={parcel}")
-        self.driver.get(f"https://blue.kingcounty.com/Assessor/eRealProperty/Dashboard.aspx?ParcelNbr={parcel}")
-        parcel_name = ""
-        try:
-            # parcel_form = self.driver.find_element(By.ID, "cphContent_txtParcelNbr")
-            # parcel_form.send_keys(parcel)
-
-            # search_box = self.driver.find_element(By.NAME, "kingcounty_gov$cphContent$btn_Search")
-            # search_box.click()
-
-            # Wait until the table view has loaded
-            # table_loaded = self.driver.find_element(By.ID, "topnavlistbtn")
-            # wait = WebDriverWait(self.driver, timeout=5)
-            # wait.until(lambda _: table_loaded.is_displayed())
-
-            name = self.driver.find_element(By.XPATH, "/html/body/form/table/tbody/tr/td[2]/table/tbody/tr[2]/td[1]/table/tbody/tr[2]/td/div/table/tbody/tr[2]/td[2]")
-            parcel_name = name.text
-            # print(name.text)
-        except:
-            print(f"Couldn't find parcel name for parcel number {parcel}")
-
-        return parcel_name
 
     def get_parcel_number(self, major, minor):
         return str(major).rjust(6, "0") + str(minor).rjust(4,"0")
@@ -73,10 +50,9 @@ if __name__ == "__main__":
             pass
 
     print(f"starting at index {nrows}")
-
-    scraper = ParcelScraper("EXTR_AptComplex.csv")
-    df = scraper.df.loc[nrows:nrows + incr]
+    loader = ApartmentDataLoader("EXTR_AptComplex.csv")
+    df = loader.df.loc[nrows:nrows + incr]
 
     parcelHelper = ParcelLookupHelper(os.getcwd(), True)
     parcelHelper.scrape_parcel_owners(df["ParcelNumber"], f"raw/owners_{nrows // incr}", False)
diff --git a/requirements-conda.txt b/requirements-conda.txt
new file mode 100644
index 0000000..95900eb
--- /dev/null
+++ b/requirements-conda.txt
@@ -0,0 +1,69 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+# created-by: conda 25.5.1
+_libgcc_mutex=0.1=main
+_openmp_mutex=5.1=1_gnu
+beautifulsoup4=4.13.5=py313h06a4308_0
+blas=1.0=mkl
+bottleneck=1.4.2=py313hf0014fa_0
+brotlicffi=1.0.9.2=py313h6a678d5_1
+bs4=4.13.5=py39hd3eb1b0_0
+bzip2=1.0.8=h5eee18b_6
+ca-certificates=2025.9.9=h06a4308_0
+certifi=2025.8.3=py313h06a4308_0
+cffi=1.17.1=py313h1fdaa30_1
+charset-normalizer=3.3.2=pyhd3eb1b0_0
+defusedxml=0.7.1=pyhd3eb1b0_0
+expat=2.7.1=h6a678d5_0
+idna=3.7=py313h06a4308_0
+intel-openmp=2025.0.0=h06a4308_1171
+ld_impl_linux-64=2.40=h12ee557_0
+libffi=3.4.4=h6a678d5_1
+libgcc-ng=11.2.0=h1234567_1
+libgomp=11.2.0=h1234567_1
+libmpdec=4.0.0=h5eee18b_0
+libstdcxx-ng=11.2.0=h1234567_1
+libuuid=1.41.5=h5eee18b_0
+libxcb=1.17.0=h9b100fa_0
+libzlib=1.3.1=hb25bd0a_0
+mkl=2025.0.0=hacee8c2_941
+mkl-service=2.4.0=py313h5eee18b_3
+mkl_fft=1.3.11=py313hacdc0fc_1
+mkl_random=1.2.8=py313h8928b4f_1
+ncurses=6.5=h7934f7d_0
+numexpr=2.11.0=py313h41d4191_1
+numpy=2.3.3=py313h720eef7_0
+numpy-base=2.3.3=py313h95072fd_0
+odfpy=1.4.1=pyhd8ed1ab_1
+openssl=3.0.17=h5eee18b_0
+pandas=2.3.2=py313h280b501_0
+pip=25.2=pyhc872135_0
+pthread-stubs=0.3=h0ce48e5_1
+pycparser=2.23=py313h06a4308_0
+pysocks=1.7.1=py313h06a4308_0
+python=3.13.7=h7e8bc2b_100_cp313
+python-dateutil=2.9.0post0=py313h06a4308_2
+python-tzdata=2025.2=pyhd3eb1b0_0
+python_abi=3.13=1_cp313
+pytz=2025.2=py313h06a4308_0
+readline=8.3=hc2a1206_0
+requests=2.32.5=py313h06a4308_0
+setuptools=72.1.0=py313h06a4308_0
+six=1.17.0=py313h06a4308_0
+soupsieve=2.5=py313h06a4308_0
+sqlite=3.50.2=hb25bd0a_1
+tbb=2022.0.0=hdb19cb5_0
+tbb-devel=2022.0.0=hdb19cb5_0
+tk=8.6.15=h54e0aa7_0
+typing-extensions=4.15.0=py313h06a4308_0
+typing_extensions=4.15.0=py313h06a4308_0
+tzdata=2025b=h04d1e81_0
+urllib3=2.5.0=py313h06a4308_0
+wheel=0.45.1=py313h06a4308_0
+xorg-libx11=1.8.12=h9b100fa_1
+xorg-libxau=1.0.12=h9b100fa_0
+xorg-libxdmcp=1.1.5=h9b100fa_0
+xorg-xorgproto=2024.1=h5eee18b_1
+xz=5.6.4=h5eee18b_1
+zlib=1.3.1=hb25bd0a_0
diff --git a/requirements.txt b/requirements.txt
index 5794235..1bed897 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,32 @@
+beautifulsoup4==4.13.5
+Bottleneck==1.4.2
+brotlicffi==1.0.9.2
+certifi==2025.8.3
+charset-normalizer==3.3.2
+defusedxml==0.7.1
+idna==3.7
+mkl_fft==1.3.11
+mkl_random==1.2.8
+mkl-service==2.4.0
+numexpr==2.11.0
+numpy==2.3.3
+odfpy==1.4.1
+pandas==2.3.2
+pip==25.2
+pycparser==2.23
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+requests==2.32.5
+setuptools==72.1.0
+six==1.17.0
+soupsieve==2.5
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0
+wheel==0.45.1
 argon2-cffi==25.1.0
 argon2-cffi-bindings==25.1.0
-certifi==2025.8.3
 cffi==2.0.0
 minio==7.2.16
-pycparser==2.23
 pycryptodome==3.23.0
-typing_extensions==4.15.0
-urllib3==2.5.0
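Finally, since the exact-match heuristic in `processors/corp_owners.py` is the least obvious part of the pipeline, here is a standalone sketch of the name normalization it performs (the regex and `replace_map` mirror the diff above; the sample names are made up):

```python
import re

# Mirrors is_exact_match in LookupCompaniesHelper._separate_search_results.
# The trailing dot is optional so that plain "LLC"/"LLP" also match.
p = re.compile(r"L[\s.]?L[\s,.]?[PC][.]?", flags=re.IGNORECASE)

replace_map = {
    ",": "",
    "LIMITED LIABILITY COMPANY": "LLC",
    "LIMITED PARTNERSHIP": "LLC",
    "APARTMENTS": "APTS",
    "LTD PS": "LLC",
    "LTD PARTNERSHIP": "LLC",
}

def normalize(name: str) -> str:
    name = re.sub(p, "LLC", name)        # collapse LLC/LLP spelling variants
    for k, v in replace_map.items():     # then apply the literal replacements
        name = name.replace(k, v)
    return name

# Made-up examples: every variant normalizes to the same string, so each
# would count as an exact match for "GRE DOWNTOWNER LLC".
for name in ["GRE DOWNTOWNER L.L.C.",
             "GRE DOWNTOWNER LLC.",
             "GRE DOWNTOWNER LIMITED LIABILITY COMPANY"]:
    assert normalize(name) == "GRE DOWNTOWNER LLC", name
```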