add example script for extracting eviction court filing data #12
4215 experiments/gre.html Normal file
File diff suppressed because it is too large
36 experiments/scrape_evictions.py Normal file
@@ -0,0 +1,36 @@
import pandas as pd
from bs4 import BeautifulSoup


def get_df(html_path):
    """
    Given a local path to some saved HTML, return a DataFrame with the extracted court filing data.

    1. HTML is extracted from the King County Script Portal: https://dja-prd-ecexap1.kingcounty.gov/?q=node/411&199355=411110
    2. Search an organization name, then use "inspect" to look at the raw HTML for the page.
    3. Look for a <table> element with an id="form.SearPage[some random digits]" and copy all of its internal HTML.
    4. Save it somewhere locally and note the path to pass to this function (for example, save to data/input/[your chosen filename].html).

    (One could alternatively use the requests library to automate this part, but we're also blocked by a CAPTCHA. See the commented sketch below for locating the table programmatically.)
    """
    with open(html_path) as f:
        soup = BeautifulSoup(f, "html.parser")
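    # A sketch, not part of the author's workflow: had the whole results page
    # been saved rather than just the table's inner HTML, the step-3 table could
    # be located programmatically (assuming its id really starts with
    # "form.SearPage"):
    #
    #   import re
    #   table = soup.find("table", id=re.compile(r"^form\.SearPage"))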
    # Each result row in the saved table is a <tr class="row1">
    trs = soup.find_all("tr", attrs={"class": "row1"})

    cols = ["CaseNumber", "Filing Date", "CaseName", "Charge/Cause of Action", "NextHearing", "Status"]
    rows = []
    for row in trs:
        # Pair each cell's stripped text with its column name
        entries = row.find_all("td")
        entry_str = [e.text.strip() for e in entries]
        s = dict(zip(cols, entry_str))
        rows.append(s)
    df = pd.DataFrame(rows)
    return df


if __name__ == "__main__":
    # Change this value to extract new data
    html_path = "gre.html"
    df = get_df(html_path)
    # Change this output path or name as you see fit
    df.to_csv("../data/intermediates/gre_evictions_2025.csv")
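The CSV written above stores every column as text, including dates. A minimal post-processing sketch, assuming the portal's Filing Date values are in a format pandas can parse (inspect the CSV first) and reusing the output path from the script:

import pandas as pd

df = pd.read_csv("../data/intermediates/gre_evictions_2025.csv")
# Parse filing dates; unparseable values become NaT instead of raising
df["Filing Date"] = pd.to_datetime(df["Filing Date"], errors="coerce")
# Quick sanity check: newest filings first
print(df.sort_values("Filing Date", ascending=False).head())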