2025-10-20 20:09:15 +00:00
2 changed files with 4251 additions and 0 deletions
--- a/experiments/gre.html
+++ b/experiments/gre.html
--- a/experiments/scrape_evictions.py
+++ b/experiments/scrape_evictions.py
@ -0,0 +1,36 @@
+import pandas as pd
+from bs4 import BeautifulSoup
+
+def get_df(html_path):
+    """
+    Given a local path to some saved html, return a dataframe with the extracted court filing data
+    1. HTML is extracted from King County Scripts Portal: https://dja-prd-ecexap1.kingcounty.gov/?q=node/411&199355=411110
+    2. Search an organization name, then use "inspect" to look at the raw html for the page. 
+    3. Look for a <table> element with an id='form.SearPage[some random digits] and copy all internal html. 
+    4. Save it to somewhere locally and note the path to pass to this function. (for example, save to data/input/[your chosen filename].html) 
+    (One could alternatively use the requests library to automate this part, but we're restricted also by a CAPTCHA)
+    """
+    f = open(html_path)
+    soup = BeautifulSoup(f)
+    trs = soup.find_all("tr", attrs={"class":"row1"})
+
+    cols = ["CaseNumber", "Filing Date", "CaseName", "Charge/Cause of Action", "NextHearing", "Status"]
+    rows = []
+    for row in trs:
+        # print(type(row))
+        entries = row.find_all("td")
+        for e in entries:
+            # print(e.text.strip())
+
+            entry_str = [e.text.strip() for e in entries]
+            s = dict(zip(cols, entry_str))
+            rows.append(s)
+    df = pd.DataFrame(rows)
+    return df
+
+if __name__ == "__main__":
+    # Change this value to extract new data
+    html_path = "gre.html" 
+    df = get_df(html_path)
+    # Change this output path or name as you see fit
+    df.to_csv("../data/intermediates/gre_evictions_2025.csv")