add example script for extracting eviction court filing data #12

Merged
linnealovespie merged 2 commits from linnealovespie/evictions into main 2025-10-20 20:09:15 +00:00
2 changed files with 4251 additions and 0 deletions

4215
experiments/gre.html Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,36 @@
import pandas as pd
from bs4 import BeautifulSoup
def get_df(html_path) -> pd.DataFrame:
    """Extract court filing rows from locally saved HTML into a DataFrame.

    How to obtain the input HTML:

    1. Open the King County Scripts Portal:
       https://dja-prd-ecexap1.kingcounty.gov/?q=node/411&199355=411110
    2. Search for an organization name, then use the browser's "inspect"
       tool to look at the raw HTML of the results page.
    3. Look for a <table> element whose id looks like
       'form.SearPage[some random digits]' and copy all of its inner HTML.
    4. Save it locally (for example, data/input/[your chosen filename].html)
       and pass that path to this function.

    (The requests library could automate the download step, but the portal
    is also protected by a CAPTCHA.)

    Args:
        html_path: Path to the saved HTML file.

    Returns:
        A DataFrame with one row per ``<tr>`` element carrying the CSS class
        ``row1`` that contains at least one ``<td>``. The stripped cell texts
        are assigned, in order, to the columns CaseNumber, Filing Date,
        CaseName, Charge/Cause of Action, NextHearing and Status. Cells
        beyond the sixth are ignored; missing trailing cells become NaN.
        When no rows match, the frame is empty but still has these columns.
    """
    cols = [
        "CaseNumber",
        "Filing Date",
        "CaseName",
        "Charge/Cause of Action",
        "NextHearing",
        "Status",
    ]

    # Binary mode lets BeautifulSoup detect the encoding itself instead of
    # relying on the platform default; the context manager closes the file.
    with open(html_path, "rb") as f:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(f, "html.parser")

    # NOTE(review): only rows with class "row1" are selected. If the portal
    # alternates row classes, other rows would be skipped -- confirm against
    # a saved page.
    rows = []
    for tr in soup.find_all("tr", attrs={"class": "row1"}):
        cells = [td.text.strip() for td in tr.find_all("td")]
        if not cells:
            # A row without data cells carries no filing; skip it rather
            # than emit an empty (or stale) record.
            continue
        # zip() pairs cells with column names positionally and stops at the
        # shorter of the two sequences.
        rows.append(dict(zip(cols, cells)))

    return pd.DataFrame(rows, columns=cols)
if __name__ == "__main__":
    # Saved portal HTML to parse; point this at another file to extract
    # new data.
    html_path = "gre.html"
    # Destination CSV; change the directory or filename as you see fit.
    csv_path = "../data/intermediates/gre_evictions_2025.csv"
    get_df(html_path).to_csv(csv_path)