From 866fa5ebbaa22e71b751a480adb5d5bc2c1f2e68 Mon Sep 17 00:00:00 2001
From: Linnea
Date: Mon, 22 Sep 2025 20:17:10 -0700
Subject: [PATCH 1/4] cleanup

---
 README.md                      |  17 ++--
 experiments/aemp.org           |   8 ++
 experiments/gre_apartments.ods | Bin 23441 -> 23880 bytes
 processors/corp_owners.py      | 174 ++++++++++++++++++++++++---------
 processors/gre-llc.py          |  34 +++----
 processors/merge.py            |   5 +-
 processors/parcel_owners.py    |  46 +++------
 processors/scrape.py           |  42 ++------
 requirements-conda.txt         |  69 +++++++++++++
 9 files changed, 249 insertions(+), 146 deletions(-)
 create mode 100644 requirements-conda.txt

diff --git a/README.md b/README.md
index 88926d4..82fb549 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
 # aemp-seattle
-Initial repository for building up Seattle database for anti-eviction mapping.
+Initial repository for building up a Seattle database for anti-eviction mapping.
 
-Modelled off the [evictorbase pipeline code](https://github.com/antievictionmappingproject/eb-data-pipeline).
+Modelled off the [evictorbase pipeline code](https://github.com/antievictionmappingproject/eb-data-pipeline).
 
-Relevant but not 1-1 walkthrough of how to programmatically find building owners: [350 Seattle BEPS Repo](https://github.com/BenBagBag/350_seattle_building_ownership/blob/main/How%20to%20Find%20Building%20Owners.ipynb).
+A relevant (though not 1-1) walkthrough of how to programmatically find building owners: [350 Seattle BEPS Repo](https://github.com/BenBagBag/350_seattle_building_ownership/blob/main/How%20to%20Find%20Building%20Owners.ipynb).
 
 [AEMP Seattle Community Agreements](https://docs.google.com/document/d/1ZMeRmPWmhxynBXZ-aV6R2sQBktjNYRL9Xw9PHpkVpJE/edit?usp=drive_link)
@@ -18,8 +18,9 @@
 - `to_load/`: directory for files that can be loaded directly into the PostgreSQL database
 - `experiments/`: directory for Jupyter notebooks for data exploration and script development
 
-## Data Inputs:
-[eRealProperty](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): King County assessor data for finding the owner of a given parcel.
-[Washington State Corporations and Charities Filing Database (CCFS)](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): For looking up a parcel owner name and finding the related business listing and related info.
-
-TODO: Find a good source for eviction filing data. Those with access can refer to the [potential data source list](https://docs.google.com/spreadsheets/d/1Ew0UrZvP-S74velkWSKaiSGBYcxIAoRH6IGpEzNWX6s/edit?gid=0#gid=0) to find new data sources.
\ No newline at end of file
+## Data Inputs:
+[King County Assessor:](https://info.kingcounty.gov/assessor/DataDownload/default.aspx) Download records of all apartment complexes in King County.
+[eRealProperty](https://kingcounty.gov/en/dept/kcit/data-information-services/gis-center/property-research): King County assessor data for finding the owner of a given parcel.
+[Washington State Corporations and Charities Filing Database (CCFS)](https://ccfs.sos.wa.gov/): For looking up a parcel owner name and finding the related business listing and related info.
+
+TODO: Find a good source for eviction filing data. Those with access can refer to the [potential data source list](https://docs.google.com/spreadsheets/d/1Ew0UrZvP-S74velkWSKaiSGBYcxIAoRH6IGpEzNWX6s/edit?gid=0#gid=0) to find new data sources.
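A note on parcel IDs before the processor changes below: the assessor extract identifies each parcel by a Major and a Minor number, and scrape.py (patched later in this series) concatenates them into the ten-digit parcel number that eRealProperty expects. A small worked example of that padding; the helper body mirrors get_parcel_number() in processors/scrape.py, and the second sample Major/Minor pair is made up:

    def get_parcel_number(major, minor):
        # Zero-pad Major to 6 digits and Minor to 4 digits, then concatenate.
        return str(major).rjust(6, "0") + str(minor).rjust(4, "0")

    print(get_parcel_number(524780, 1370))  # -> "5247801370", the gre-llc.py example parcel split apart
    print(get_parcel_number(98, 76))        # -> "0000980076" (hypothetical Major/Minor values)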
diff --git a/experiments/aemp.org b/experiments/aemp.org
index 674a7c8..76b826f 100755
--- a/experiments/aemp.org
+++ b/experiments/aemp.org
@@ -85,3 +85,10 @@ Can then determine which one to use
 ** TODO: Do some data cleaning to have names be the same
 eg. Seattle city of is #1 and #4 most common property owner names, should be standardized
 eg. LLC, LLP, L.L.C. etc. all to one format
+
+* 28 August
+From Dox: 610 Harvard Ave East
+Seattle, WA 98102
+Intense management malpractice, would like to know more about the above address.
+** TODO: At some point, cross-reference with registered rental data
+https://data.seattle.gov/Built-Environment/Rental-Property-Registration/j2xh-c7vt/about_data
diff --git a/experiments/gre_apartments.ods b/experiments/gre_apartments.ods
index 4f79295e08ee4b4996678d402cc94d3ac74482ae..e3c45168d38818ddc83b53dcb4bec9c758048890 100644
GIT binary patch
delta 7485
[base85-encoded binary delta omitted; spreadsheet updated, 23441 -> 23880 bytes]
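The corp_owners.py diff that follows adds on-disk caching of CCFS business-detail responses. A condensed sketch of that cache-or-fetch pattern, with the endpoint and cache path taken from the diff; error handling is stripped and the cache directory is assumed to already exist:

    import json
    import os
    import requests

    CACHE_DIR = "../data/inputs/principals_json"
    DETAILS_URL = "https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={}"

    def cached_business_details(business_id):
        cache_file = os.path.join(CACHE_DIR, f"{business_id}.json")
        if os.path.exists(cache_file):        # cache hit: reuse the saved response
            with open(cache_file) as f:
                return json.load(f)
        details = requests.get(DETAILS_URL.format(business_id)).json()
        with open(cache_file, "w") as f:      # cache miss: fetch, then save ('w', not 'r')
            json.dump(details, f)
        return details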
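Likewise, the retry loop added to _get_business_search_results() below shortens the owner name one word at a time until CCFS returns something ("GRE DOWNTOWNER LLC" -> "GRE DOWNTOWNER" -> "GRE"). The same shortening step in isolation, with the network call abstracted behind a caller-supplied search function; the reversed-string indexing in the diff is equivalent to the rsplit used here:

    def shrink_until_found(name, search):
        # Retry with progressively shorter names until `search` yields results.
        name = name.strip()
        while name:
            results = search(name)
            if results:
                return results
            if " " not in name:                    # nothing left to strip off
                return []
            name = name.rsplit(" ", 1)[0].strip()  # drop the last word and retry
        return []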
diff --git a/processors/corp_owners.py b/processors/corp_owners.py
index 07e300c..00513c6 100644
--- a/processors/corp_owners.py
+++ b/processors/corp_owners.py
@@ -1,5 +1,10 @@
 """
-    Utility functions for extracting owners.
+    Utility functions for
+    1. LookupCompaniesHelper: looking up a parcel owner in the WA Corporations and Charities Database,
+       and extracting the best search result.
+    2. GroupCompaniesHelper (WIP): given a company's stated governors and addresses,
+       grouping together addresses that likely share the same landlord.
+
 """
 
 import pandas as pd
@@ -11,9 +16,6 @@ import re
 # import geopandas as gp
 import urllib.parse
 
-# Utils for finding principals
-
-
 search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
 # search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
 principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
@@ -40,11 +42,30 @@
     }
 
 def get_business_details(business_id):
-    """ Get business details from the Corporation and charities filing database. """
+    """ Get business details from the Corporation and charities filing database.
+    """
     url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
     # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
-    r = requests.get(url)
-    return json.loads(r.text)
+    if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")):
+        # print("found json")
+        with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
+            return json.load(f)
+    else:
+        r = requests.get(url)
+        # Try to read the response text
+        try:
+            r_json = json.loads(r.text)
+        except:
+            r_json = {}
+
+        try:
+            # TODO: Will this write an empty string if no actual request result?
+            with open(f"../data/inputs/principals_json/{business_id}.json", 'w') as f:
+                str_json = json.dumps(r_json)
+                f.write(str_json)
+        except:
+            pass
+        return r_json
 
 
 class LookupCompaniesHelper:
@@ -53,26 +74,67 @@ class LookupCompaniesHelper:
 
     def _get_empty_df(self):
         return pd.DataFrame([], columns = ['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId',
-                                           'Address', 'Status', 'address_match', 'ubi_match', 'id_match'])
+                                           'Address', 'Status', 'address_match'])
 
-    def _get_business_search_results(self, business_name, page_num):
-        r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
-        try:
+    def _get_business_search_results(self, business_name_orig, page_num):
+        business_name = business_name_orig.strip()
+        no_result = True
+        result = {}
+        while no_result and len(business_name) > 0:
+            print(f"searching with name {business_name}")
+            r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
+            # TODO: add back the try-catch, but with better recovery this time
+            # Seems like it's more of a network issue than didn't find anything
+            if r.status_code == 429:
+                # TODO: Raise an error instead
+                print("This IP address has likely been blocked by CCFS, try using a VPN")
             result = json.loads(r.text)
-            #return json.loads(r.text)
-        except:
-            result = {}
+            if len(result) > 0:
+                no_result = False
+            else:
+                # Strip off the last word from the search term and try again next iteration
+                try:
+                    # Get the index of the last space in the name
+                    last_space = business_name[::-1].index(" ")
+                    business_name = business_name[: -1 - last_space].strip()
+                except ValueError:
+                    # TODO: In this case, try with the LastBuyer instead of ListedOwner? Upstream
+                    print(f"Found no business with name {business_name_orig}\n")
+                    business_name = ""
+
+        return result
 
     def _extract_search_results(self, search_term, search_req_response):
-        res_list = [[search_term, res['BusinessName'], res['UBINumber'], res['BusinessID'],
-                     res['PrincipalOffice']['PrincipalStreetAddress']['FullAddress'], res["BusinessStatus"]]
-                    for res in search_req_response]
-        res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status"])
-        # Basically keep a list of exact matches, and build a list of potential matches that we give to human verifiers
+        # TODO: If no results, return a row with the search term and nans for everything else
+        res_list = []
+        for res in search_req_response:
+            # build up the known responses
+            # get more business data from that id
+            business_info = get_business_details(res["BusinessID"])
+            res_list += [[search_term.strip(),
+                          res.get('BusinessName').strip(),
+                          res.get('UBINumber'),
+                          res.get('BusinessID'),
+                          res.get('PrincipalOffice')['PrincipalStreetAddress']['FullAddress'],
+                          res.get("BusinessStatus"),
+                          business_info.get("BINAICSCodeDesc", "NOT_FOUND")]]
+        # return an empty row if no search results
+        if len(search_req_response) == 0:
+            res_list += [[search_term, "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND", "NOT_FOUND"]]
+
+        res_df = pd.DataFrame(res_list, columns=['SearchTerm', 'BusinessName', 'UBINumber', 'BusinessId', 'Address', "Status", "BusinessNature"])
+
+        # Clean some of the results a bit more:
+        # Keep only active companies and searches that yielded no results
+        res_df = res_df[(res_df["Status"]=="Active") | (res_df["Status"]=="NOT_FOUND")]
+        # TODO: Maybe keep only real estate / property investments?
 
+        # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers
         exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist()
         if exact_match:
             res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0)
+
         return res_df
 
     def _determine_search_matches(self, search_results_df):
@@ -82,24 +144,21 @@ class LookupCompaniesHelper:
         and result have the same address. Could add search terms as a subset for duplicated call
         """
         search_results_df['address_match'] = search_results_df.duplicated(subset=['Address'], keep=False)
-        search_results_df['ubi_match'] = search_results_df.duplicated(subset=['UBINumber'], keep=False)
-        search_results_df['id_match'] = search_results_df.duplicated(subset=['BusinessId'], keep=False)
 
     def _get_all_company_name_match_search_results(self, owner_name):
         n = 1
         res_length = 100
         search_results = []
 
-        while res_length == 100:
-            res = self._get_business_search_results(owner_name, n)
-            search_results += (res)
-            n += 1
-            res_length = len(res)
+        # while res_length == 100:
+        res = self._get_business_search_results(owner_name, n)
+        # search_results += (res)
+        # n += 1
+        # res_length = len(res)
 
-        return search_results
+        return res
 
     """
-    TODO: Remove the ubi and address match, this does nothing to help
     """
     def _get_potential_company_name_matches(self, owner_name):
        all_search_results = self._get_all_company_name_match_search_results(owner_name)
@@ -111,40 +170,59 @@ class LookupCompaniesHelper:
     """
     utils to separate search results into exact match, potential match (where no exact match was found),
     and additional matches (extra matches if there was an exact match and additional matches)
+    TODO: Give more robust answers here!
+    Other abbreviations include:
+    - Apartment: APTS -> Apartments
+    - Partnership
+    - etc.
     """
     def is_exact_match(row):
         """ Extract exact matches, including some regex magic. """
         search = row["SearchTerm"]
         result = row["BusinessName"]
-
+
         # examples: LLC, LLP, L L C, L.L.C., L.L.C. L.L.P., L.L.P, LLC.
         # Limited Partnership, Limited liability company
         p = re.compile("L[\s.]?L[\s,.]?[PC][.]" ,flags=re.IGNORECASE)
-        result=result.replace(",", "")
+
+        replace_map = {
+            ",": "",
+            "LIMITED LIABILITY COMPANY":"LLC",
+            "LIMITED PARTNERSHIP": "LLC",
+            "APARTMENTS": "APTS",
+            "LTD PS": "LLC",
+            "LTD PARTNERSHIP": "LLC",
+        }
+
         result= re.sub(p, "LLC", result)
-        result=result.replace("LIMITED LIABILITY COMPANY", "LLC")
-        result=result.replace("LIMITED PARTNERSHIP", "LLC")
-
-        search=search.replace(",", "")
         search=re.sub(p, "LLC", search)
-        search=search.replace("LIMITED PARTNERSHIP", "LLC")
-        search=search.replace("LIMITED LIABILITY COMPANY", "LLC")
+
+        for k,v in replace_map.items():
+            result = result.replace(k, v)
+            search = search.replace(k, v)
+
+        # result=result.replace(",", "")
+        # result=result.replace("LIMITED LIABILITY COMPANY", "LLC")
+        # result=result.replace("LIMITED PARTNERSHIP", "LLC")
+
+        # search=search.replace(",", "")
+        # search=search.replace("LIMITED PARTNERSHIP", "LLC")
+        # search=search.replace("LIMITED LIABILITY COMPANY", "LLC")
 
         return search == result
 
     exact_matches = self._get_empty_df()
-    exact_matches.columns
     potential_matches = self._get_empty_df()
-    additional_matches = self._get_empty_df()
+    # additional_matches = self._get_empty_df()
 
     exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)]
-    if len(exact_match) > 0:
-        exact_matches = pd.concat([exact_matches, exact_match], ignore_index=True)
-        additional_matches = pd.concat([additional_matches, results[results['SearchTerm'] != results['BusinessName']]], ignore_index=True)
+    # TODO: If going to do len(results) check, then need to filter by business nature sooner
+    # Len results heuristic doesn't work for empty searches, or the recursive search
+    if len(exact_match) > 0: #or len(results) == 1:
+        exact_matches = pd.DataFrame(results.iloc[0]).T
    else:
        potential_matches = pd.concat([potential_matches, results], ignore_index=True)
 
-    return exact_matches, potential_matches, additional_matches
+    return exact_matches, potential_matches
 
    def get_company_list_name_matches(self, owner_list: list):
        """
        owner_list: a list of owner names that will be searched in the CCFS database for matches.
        Exact_matches: when search term exactly matches a result in CCFS database.
        Potential_matches: when search term doesn't exactly match, there needs to be some human verification here to determine.
-       Additional_matches: extraneous matches in case potential_matches didn't yield enough results.
""" exact_matches = self._get_empty_df() potential_matches = self._get_empty_df() - additional_matches = self._get_empty_df() + # TODO: Instead of additional matches, make a df for "no matches" for owner in owner_list: + owner = owner.strip() # Clean owner name slightly matches = self._get_potential_company_name_matches(owner) - temp_exact, temp_potential, temp_add = self._separate_search_results(matches) + temp_exact, temp_potential = self._separate_search_results(matches) exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True) potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True) - additional_matches = pd.concat([temp_add, additional_matches], ignore_index=True) - return exact_matches, potential_matches, additional_matches + # additional_matches = pd.concat([temp_add, additional_matches], ignore_index=True) + return exact_matches, potential_matches def get_company_matches_and_export(self, owner_list: list, x: int): @@ -261,7 +339,7 @@ class GroupCompaniesHelper: return principals business_ids = [res['BusinessID'] for res in search_results] business_names = [res['BusinessName'] for res in search_results] - ubi_nums = [res['UBINumber'] for res in search_results] + # ubi_nums = [res['UBINumber'] for res in search_results] for id, name in zip(business_ids, business_names): business_json = get_business_details(id) diff --git a/processors/gre-llc.py b/processors/gre-llc.py index 509c4d2..1e7638f 100644 --- a/processors/gre-llc.py +++ b/processors/gre-llc.py @@ -5,44 +5,36 @@ Created on Fri Aug 15 19:06:45 2025 @author: linnea +Script to + Address: 308 4th Ave S, Seattle, WA, 98104 ParcelNumber: 5247801370 ListedOwner: GRE DOWNTOWNER LLC PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY - - + GRE List: https://goodmanre.com/our-projects/ TODO: - Make a flag that shows if the buywer / owner are similar - - Check the fuzzy wuzzy matching in utils - Get the address field from CCFS, put in corp_owners - If the previous buyer doesn't make sense, get the year of the last buying to see if it's at all recent for sanity checks - -1. Load in the whole dataframe of owners and buyers -2. Get the whole list of responses for the listed owner - - This shows all the companies that match the listed owner in assessor data - - Need to find the most likely company in CCFS to match the listed owner -3. Make a df out of? -4. """ from corp_owners import LookupCompaniesHelper, GroupCompaniesHelper import pandas as pd lookup_helper = LookupCompaniesHelper(("../data/intermediates")) -df = pd.read_csv("../data/intermediates/owners_listed.csv") -# Almost never need additional matches, as it's only populated if there's an exact match -exact, potential, additional = lookup_helper.get_company_list_name_matches(["GRE DOWNTOWNER LLC"]) -owner_names = df["ListedOwner"].unique() -# exact, potential, additional = lookup_helper.get_company_list_name_matches(owner_names[:10]) +# Option 1: Uncomment the two lines to run the full script. 
+# df = pd.read_csv("../data/intermediates/owners_listed.csv")
+# owner_names = df["ListedOwner"].unique()
+
+# Option 2: Leave these lines uncommented to run with a specific subset for debugging
+df = pd.read_excel("../experiments/gre_apartments.ods", engine='odf')
+df = df.iloc[1]
+owner_names = [df["ListedOwner"]]
+
+exact, potential = lookup_helper.get_company_list_name_matches(owner_names)

-if(len(exact) >= 1):
-    ubi = exact.loc[0, "UBINumber"]
-
-group_helper= GroupCompaniesHelper("../data/intermediates", "principals")
-# TODO: Figure out how to format the url for proper response
-res_group = group_helper.get_companies_principals(exact)
diff --git a/processors/merge.py b/processors/merge.py
index 50302e0..8cce5ba 100644
--- a/processors/merge.py
+++ b/processors/merge.py
@@ -2,9 +2,12 @@
 # -*- coding: utf-8 -*-
 """
 Created on Tue Aug 12 18:17:47 2025
-
 @author: linnea
 
+One-time script for cleaning up parcel lookup data.
+If everything went 100% smoothly in scrape.py (i.e. no search results came back empty),
+then this script shouldn't be needed.
+
 1. Load intermediate results
 2. Load original data with unmodified parcelid
 3. Add a taxparcelid to unmodified so can merge
diff --git a/processors/parcel_owners.py b/processors/parcel_owners.py
index e1663e5..a82d1b5 100644
--- a/processors/parcel_owners.py
+++ b/processors/parcel_owners.py
@@ -1,3 +1,8 @@
+"""
+    Utils for finding a parcel owner given an address.
+    Data source is King County Assessor.
+"""
+
 import pandas as pd
 from bs4 import BeautifulSoup
 import requests
@@ -43,13 +48,6 @@ class ParcelLookupHelper:
         if data_not_found:
             return None
         return html_soup
-
-    # TODO: Maybe include sales history AND current owner?
-    # Example: 308 4TH AVE S 98104
-    # Current owner = GRE DOWNTOWNER LLC
-    # Sales history = CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY
-    # Website also shows GRE DOWNTOWNER https://www.addisononfourth.com/
-    # TODO: cache the whole soup object so can lookup later?
 
     def _get_owner_name_from_soup(self, soup: object):
         """
         Extract the owner name from a given BeautifulSoup object, `soup`, of a Property Detail page.
@@ -60,34 +59,8 @@ class ParcelLookupHelper:
         parent = title.parent
         next_tr = title and parent.find_next('tr')
         table = next_tr and next_tr.table
         return table and table.find_all('td')[5].text
-
-    def _get_num_units_and_types_from_soup(self, soup: object):
-        """
-        Given a BeautifulSoup object, `soup`, of a Property Detail page, extract:
-            - the number of units in the building
-            - the unit types
-            - the sq ft of each unit type
-            - number of bed/bath rooms in each unit type
-        """
-        title = soup.find('span', text = 'Unit Breakdown')
-        if not title:
-            return { 'numUnits': 'NOT_FOUND', 'unitDetails': 'NOT_FOUND' }
-
-        table = title and title.find_next('div').table
-        table_rows = table and table.find_all('tr')[1:]
-        cells = table_rows and [row.find_all('td') for row in table_rows]
-        table_data = []
-
-        for c in cells:
-            table_data.append([span.text for span in c])
-        total_units = sum([int(row[1]) for row in table_data])
-        dict_keys = ['type', 'number', 'sqft', 'bed', 'bath']
-        units = [dict(zip(dict_keys, row)) for row in table_data]
-        return { 'numUnits': total_units, 'unitDetails': units }
-
-    # TODO: pass maybe a list of features want to extract?
 
     def _scrape_parcel_owners(self, tax_parcel_id_numbers: list, file_name: str):
@@ -110,6 +83,10 @@ class ParcelLookupHelper:
         self._write_parcel_owner_csv(parcel_df, file_name)
 
     def _save_html(self, soup, id):
+        """
+        Given a 'soup' type response for an address lookup, save it
+        as an HTML file for future lookups.
+        """
         table = soup.find("table", attrs={"class":"_table2", "id":"TABLE1"})
         with open(f"{self.output_path}/html/{id}.html", 'w') as f:
             f.write(str(table))
@@ -122,7 +99,6 @@ class ParcelLookupHelper:
             return "NOT FOUND"
         else:
             return self._get_owner_name_from_soup(parcel_soup)
-        # parcel_df.loc[len(parcel_df.index)] = [id, owner_name]
 
     def _scrape_parcel_owners_and_unit_details(self, tax_parcel_id_numbers: list, file_name: str):
diff --git a/processors/scrape.py b/processors/scrape.py
index 8ed090b..cd8e2e6 100644
--- a/processors/scrape.py
+++ b/processors/scrape.py
@@ -1,13 +1,15 @@
+"""
+Script for getting all apartment addresses in King County,
+and looking up their parcel owners in KC Assessor.
+Only runs in set increments to avoid being blocked by the Assessor site.
+"""
 import pandas as pd
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-import time
 import os
 from parcel_owners import ParcelLookupHelper
 
-incr = 998
+incr = 998 # Number of addresses to look up per run.
 
-class ParcelScraper:
+class ApartmentDataLoader:
     def __init__(self, path):
         self.path = path # path to the csv
         # self.driver = self.load_driver()
@@ -35,31 +37,6 @@ class ApartmentDataLoader:
             if idx % 50 == 0:
                 print(f"Saving row {idx}")
                 self.df.to_csv("apartments_with_owners.csv")
-
-    def submit_parcel(self, parcel):
-        self.driver.get("https://blue.kingcounty.com/Assessor/eRealProperty/default.aspx")
-        print(f"https://blue.kingcounty.com/Assessor/eRealProperty/Dashboard.aspx?ParcelNbr={parcel}")
-        self.driver.get(f"https://blue.kingcounty.com/Assessor/eRealProperty/Dashboard.aspx?ParcelNbr={parcel}")
-        parcel_name = ""
-        try:
-            # parcel_form = self.driver.find_element(By.ID, "cphContent_txtParcelNbr")
-            # parcel_form.send_keys(parcel)
-
-            # search_box = self.driver.find_element(By.NAME, "kingcounty_gov$cphContent$btn_Search")
-            # search_box.click()
-
-            # Wait until the table view has loaded
-            # table_loaded = self.driver.find_element(By.ID, "topnavlistbtn")
-            # wait = WebDriverWait(self.driver, timeout=5)
-            # wait.until(lambda _: table_loaded.is_displayed())
-
-            name = self.driver.find_element(By.XPATH, "/html/body/form/table/tbody/tr/td[2]/table/tbody/tr[2]/td[1]/table/tbody/tr[2]/td/div/table/tbody/tr[2]/td[2]")
-            parcel_name = name.text
-            # print(name.text)
-        except:
-            print(f"Couldn't find parcel name for parcel number {parcel}")
-
-        return parcel_name
 
     def get_parcel_number(self, major, minor):
         return str(major).rjust(6, "0") + str(minor).rjust(4,"0")
@@ -73,10 +50,9 @@ if __name__ == "__main__":
         pass
 
     print(f"starting at index {nrows}")
-
-    scraper = ParcelScraper("EXTR_AptComplex.csv")
-    df = scraper.df.loc[nrows:nrows + incr]
+    loader = ApartmentDataLoader("EXTR_AptComplex.csv")
+    df = loader.df.loc[nrows:nrows + incr]
 
     parcelHelper = ParcelLookupHelper(os.getcwd(), True)
     parcelHelper.scrape_parcel_owners(df["ParcelNumber"], f"raw/owners_{nrows // incr}", False)
diff --git a/requirements-conda.txt b/requirements-conda.txt
new file mode 100644
index 0000000..95900eb
--- /dev/null
+++ b/requirements-conda.txt
@@ -0,0 +1,69 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+# created-by: conda 25.5.1
+_libgcc_mutex=0.1=main
+_openmp_mutex=5.1=1_gnu
+beautifulsoup4=4.13.5=py313h06a4308_0
+blas=1.0=mkl
+bottleneck=1.4.2=py313hf0014fa_0
+brotlicffi=1.0.9.2=py313h6a678d5_1
+bs4=4.13.5=py39hd3eb1b0_0
+bzip2=1.0.8=h5eee18b_6
+ca-certificates=2025.9.9=h06a4308_0
+certifi=2025.8.3=py313h06a4308_0
+cffi=1.17.1=py313h1fdaa30_1
+charset-normalizer=3.3.2=pyhd3eb1b0_0
+defusedxml=0.7.1=pyhd3eb1b0_0
+expat=2.7.1=h6a678d5_0
+idna=3.7=py313h06a4308_0
+intel-openmp=2025.0.0=h06a4308_1171
+ld_impl_linux-64=2.40=h12ee557_0
+libffi=3.4.4=h6a678d5_1
+libgcc-ng=11.2.0=h1234567_1
+libgomp=11.2.0=h1234567_1
+libmpdec=4.0.0=h5eee18b_0
+libstdcxx-ng=11.2.0=h1234567_1
+libuuid=1.41.5=h5eee18b_0
+libxcb=1.17.0=h9b100fa_0
+libzlib=1.3.1=hb25bd0a_0
+mkl=2025.0.0=hacee8c2_941
+mkl-service=2.4.0=py313h5eee18b_3
+mkl_fft=1.3.11=py313hacdc0fc_1
+mkl_random=1.2.8=py313h8928b4f_1
+ncurses=6.5=h7934f7d_0
+numexpr=2.11.0=py313h41d4191_1
+numpy=2.3.3=py313h720eef7_0
+numpy-base=2.3.3=py313h95072fd_0
+odfpy=1.4.1=pyhd8ed1ab_1
+openssl=3.0.17=h5eee18b_0
+pandas=2.3.2=py313h280b501_0
+pip=25.2=pyhc872135_0
+pthread-stubs=0.3=h0ce48e5_1
+pycparser=2.23=py313h06a4308_0
+pysocks=1.7.1=py313h06a4308_0
+python=3.13.7=h7e8bc2b_100_cp313
+python-dateutil=2.9.0post0=py313h06a4308_2
+python-tzdata=2025.2=pyhd3eb1b0_0
+python_abi=3.13=1_cp313
+pytz=2025.2=py313h06a4308_0
+readline=8.3=hc2a1206_0
+requests=2.32.5=py313h06a4308_0
+setuptools=72.1.0=py313h06a4308_0
+six=1.17.0=py313h06a4308_0
+soupsieve=2.5=py313h06a4308_0
+sqlite=3.50.2=hb25bd0a_1
+tbb=2022.0.0=hdb19cb5_0
+tbb-devel=2022.0.0=hdb19cb5_0
+tk=8.6.15=h54e0aa7_0
+typing-extensions=4.15.0=py313h06a4308_0
+typing_extensions=4.15.0=py313h06a4308_0
+tzdata=2025b=h04d1e81_0
+urllib3=2.5.0=py313h06a4308_0
+wheel=0.45.1=py313h06a4308_0
+xorg-libx11=1.8.12=h9b100fa_1
+xorg-libxau=1.0.12=h9b100fa_0
+xorg-libxdmcp=1.1.5=h9b100fa_0
+xorg-xorgproto=2024.1=h5eee18b_1
+xz=5.6.4=h5eee18b_1
+zlib=1.3.1=hb25bd0a_0
-- 
2.49.0

From 93b81bfb00c4139e306d4c826630224ac601a1a4 Mon Sep 17 00:00:00 2001
From: Linnea
Date: Mon, 22 Sep 2025 20:22:03 -0700
Subject: [PATCH 2/4] add pip list

---
 requirements.txt | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..36e00ca
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,28 @@
+beautifulsoup4==4.13.5
+Bottleneck==1.4.2
+brotlicffi==1.0.9.2
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.3.2
+defusedxml==0.7.1
+idna==3.7
+mkl_fft==1.3.11
+mkl_random==1.2.8
+mkl-service==2.4.0
+numexpr==2.11.0
+numpy==2.3.3
+odfpy==1.4.1
+pandas==2.3.2
+pip==25.2
+pycparser==2.23
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2025.2
+requests==2.32.5
+setuptools==72.1.0
+six==1.17.0
+soupsieve==2.5
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0
+wheel==0.45.1
-- 
2.49.0

From c82df4b0fb88109db5213c8fab5278f5faa8c299 Mon Sep 17 00:00:00 2001
From: Linnea
Date: Thu, 25 Sep 2025 14:26:45 -0700
Subject: [PATCH 3/4] Confirm minio access

---
 README.md           | 2 +-
 lib/minio_helper.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e12722b..0e3373a 100644
--- a/README.md
+++ b/README.md
@@ -35,4 +35,4 @@ Use `lib/minio_helper.py` to extend the functionality
 Run `test_minio` in `lib/main.py` to test out that it works
 (TODO: move this to own testing script, perhaps unit tests)
 
-Note: You will need to have access_key and secret_key in your env before running for this to work, contact @linnealovespie or @ammaratef45 to obtain these keys)
+Note: You will need to have minio_access_key and minio_secret_key in your env before running for this to work; contact @linnealovespie or @ammaratef45 to obtain these keys.
diff --git a/lib/minio_helper.py b/lib/minio_helper.py
index ba6392b..8d3c447 100644
--- a/lib/minio_helper.py
+++ b/lib/minio_helper.py
@@ -7,8 +7,8 @@ class MinioHelper:
     def __init__(self, bucket_name: str):
         self.client = Minio(
             "minio.radmin.live",
-            access_key=os.environ['access_key'],
-            secret_key=os.environ['secret_key']
+            access_key=os.environ['minio_access_key'],
+            secret_key=os.environ['minio_secret_key']
         )
         self.bucket_name = bucket_name
-- 
2.49.0

From fae15e05b131947a39e85fed22a921e5ea83568 Mon Sep 17 00:00:00 2001
From: Linnea
Date: Thu, 25 Sep 2025 14:39:15 -0700
Subject: [PATCH 4/4] clean up comments and todos

---
 processors/corp_owners.py | 39 ++++++++++++---------------------------
 processors/gre-llc.py     |  7 +++++--
 processors/merge.py       |  3 ---
 3 files changed, 17 insertions(+), 32 deletions(-)

diff --git a/processors/corp_owners.py b/processors/corp_owners.py
index 00513c6..56f40b8 100644
--- a/processors/corp_owners.py
+++ b/processors/corp_owners.py
@@ -13,10 +13,10 @@ import requests
 import json
 import os
 import re
-# import geopandas as gp
 import urllib.parse
 
 search_for_business_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
+# Old search URL, kept in case the one above gets blocked
 # search_for_business_url = 'https://cfda.sos.wa.gov/api/BusinessSearch/GetBusinessSearchList'
 principal_url = 'https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/GetAdvanceBusinessSearchList'
 
@@ -45,9 +45,9 @@ def get_business_details(business_id):
     """ Get business details from the Corporation and charities filing database.
     """
     url = f"https://ccfs-api.prod.sos.wa.gov/api/BusinessSearch/BusinessInformation?businessID={business_id}"
+    # Old search URL, kept in case the one above gets blocked
     # url = 'https://cfda.sos.wa.gov/#/BusinessSearch/BusinessInformation?businessID={business_id}'.format(business_id=business_id)
     if(os.path.exists(f"../data/inputs/principals_json/{business_id}.json")):
-        # print("found json")
         with open(f"../data/inputs/principals_json/{business_id}.json", 'r') as f:
             return json.load(f)
     else:
         r = requests.get(url)
@@ -83,10 +83,9 @@ class LookupCompaniesHelper:
         while no_result and len(business_name) > 0:
             print(f"searching with name {business_name}")
             r = requests.post(search_for_business_url, get_business_search_payload(business_name, 100, page_num))
-            # TODO: add back the try-catch, but with better recovery this time
-            # Seems like it's more of a network issue than didn't find anything
+            # TODO: add some more error handling in case of connectivity issues.
             if r.status_code == 429:
-                # TODO: Raise an error instead
+                # TODO: Raise an error
                 print("This IP address has likely been blocked by CCFS, try using a VPN")
             result = json.loads(r.text)
@@ -98,7 +97,7 @@ class LookupCompaniesHelper:
                     last_space = business_name[::-1].index(" ")
                     business_name = business_name[: -1 - last_space].strip()
                 except ValueError:
-                    # TODO: In this case, try with the LastBuyer instead of ListedOwner? Upstream
+                    # TODO: In this case, try with the LastBuyer instead of ListedOwner?
print(f"Found no business with name {business_name_orig}\n") business_name = "" @@ -106,7 +105,6 @@ class LookupCompaniesHelper: return result def _extract_search_results(self, search_term, search_req_response): - # TODO: If no results, return a row with the search term and nans for everything else res_list = [] for res in search_req_response: # build up the known responses @@ -128,9 +126,12 @@ class LookupCompaniesHelper: # Clean some of the results a bit more: # Keep only active companies and searches that yielded no results res_df = res_df[(res_df["Status"]=="Active") | (res_df["Status"]=="NOT_FOUND")] - # TODO: Maybe keep only real estate / property investments? + + # TODO: Maybe add a filter on BusinessNature for only real estate/ property investments + # TODO: First need to get an idea of all the BusinessNature types # Keep a list of exact matches, or later build a list of potential matches that we give to human verifiers + # This check is very simple heuristic and more robust matching will occur later in processing exact_match = res_df.index[res_df['BusinessName'] == search_term].tolist() if exact_match: res_df = pd.concat([res_df.iloc[[exact_match[0]],:], res_df.drop(exact_match[0], axis=0)], axis=0) @@ -150,12 +151,7 @@ class LookupCompaniesHelper: res_length = 100 search_results = [] - # while res_length == 100: - res = self._get_business_search_results(owner_name, n) - # search_results += (res) - # n += 1 - # res_length = len(res) - + res = self._get_business_search_results(owner_name, n) return res """ @@ -199,20 +195,11 @@ class LookupCompaniesHelper: for k,v in replace_map.items(): result = result.replace(k, v) search = search.replace(k, v) - - # result=result.replace(",", "") - # result=result.replace("LIMITED LIABILITY COMPANY", "LLC") - # result=result.replace("LIMITED PARTNERSHIP", "LLC") - - # search=search.replace(",", "") - # search=search.replace("LIMITED PARTNERSHIP", "LLC") - # search=search.replace("LIMITED LIABILITY COMPANY", "LLC") return search == result exact_matches = self._get_empty_df() potential_matches = self._get_empty_df() - # additional_matches = self._get_empty_df() exact_match = results[results.apply(lambda row: is_exact_match(row), axis=1)] # TODO: If going to do len(results) check, then need to filter by business nature sooner @@ -233,7 +220,7 @@ class LookupCompaniesHelper: """ exact_matches = self._get_empty_df() potential_matches = self._get_empty_df() - # TODO: Instead of additional matches, make a df for "no matches" + # TODO: Make a df for search terms with no matches and how to make it mesh well with recursive search for owner in owner_list: owner = owner.strip() # Clean owner name slightly @@ -241,7 +228,6 @@ class LookupCompaniesHelper: temp_exact, temp_potential = self._separate_search_results(matches) exact_matches = pd.concat([temp_exact, exact_matches], ignore_index=True) potential_matches = pd.concat([temp_potential, potential_matches], ignore_index=True) - # additional_matches = pd.concat([temp_add, additional_matches], ignore_index=True) return exact_matches, potential_matches @@ -251,11 +237,10 @@ class LookupCompaniesHelper: match CSV's in the folder determined by `output_path` """ print(f"Saving output files to {self.output_path}") - exact_matches, potential_matches, additional_matches = self.get_company_list_name_matches(owner_list) + exact_matches, potential_matches = self.get_company_list_name_matches(owner_list) exact_matches.to_csv(f'{self.output_path}/exact_matches_{x}.csv') 
        potential_matches.to_csv(f'{self.output_path}/potential_matches_{x}.csv')
-       additional_matches.to_csv(f'{self.output_path}/additional_matches_{x}.csv')
 
 class GroupCompaniesHelper:
     def __init__(self, out_path: str, out_name: str):
diff --git a/processors/gre-llc.py b/processors/gre-llc.py
index 1e7638f..52989f0 100644
--- a/processors/gre-llc.py
+++ b/processors/gre-llc.py
@@ -5,13 +5,16 @@
Created on Fri Aug 15 19:06:45 2025

@author: linnea

-Script to
+Script to find exact and potential search results for a parcel owner in the CCFS database
+A representative example for the parcel owner (assessor) data scraping step
 
 Address: 308 4th Ave S, Seattle, WA, 98104
 ParcelNumber: 5247801370
 ListedOwner: GRE DOWNTOWNER LLC
 PreviousBuyer: CENTRAL PUGET SOUND REGIONAL TRASNSIT AUTHORITY
-
+
+We happen to already know the answer,
+which is that this address is part of Goodman Real Estate's extensive portfolio.
 GRE List: https://goodmanre.com/our-projects/
 
 TODO:
diff --git a/processors/merge.py b/processors/merge.py
index 8cce5ba..ef619a0 100644
--- a/processors/merge.py
+++ b/processors/merge.py
@@ -69,9 +69,7 @@ if __name__ == "__main__":
     # Add address from df_apts to df_raw
     df_join = df_apts.merge(df_raw, 'left', on="ParcelNumber")
     df_join["ListedOwner"] = "NOT_FOUND"
-    # df_join["ListedOwner"] = df_join.apply(lambda row: get_listed_owner(row), axis=1)
     for idx, row in df_join.iterrows():
-        # df_join.loc[idx, "ListedOwner"] = get_listed_owner(row)
         row.ListedOwner = get_listed_owner(row)
         df_join.loc[idx] = row
         if idx % 500 == 0:
@@ -79,7 +77,6 @@ if __name__ == "__main__":
     df_join.to_csv(f"{intermediates_path}/owners_listed.csv")
 
     df_join.to_csv(f"{intermediates_path}/owners_listed.csv")
-    # df_join = df_join.rename(columns={"Owner":"RecentBuyer"})
-- 
2.49.0
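Taken together, patches 1 and 4 leave is_exact_match() normalizing both the search term and each CCFS result before comparing them. The same rewriting in isolation, with the regex and replace_map copied from corp_owners.py; the owner name in the example is made up:

    import re

    p = re.compile(r"L[\s.]?L[\s,.]?[PC][.]", flags=re.IGNORECASE)
    replace_map = {
        ",": "",
        "LIMITED LIABILITY COMPANY": "LLC",
        "LIMITED PARTNERSHIP": "LLC",
        "APARTMENTS": "APTS",
        "LTD PS": "LLC",
        "LTD PARTNERSHIP": "LLC",
    }

    def normalize(name):
        # Collapse the dotted/spaced LLC and LLP spellings first, then apply the literal rewrites.
        name = re.sub(p, "LLC", name)
        for k, v in replace_map.items():
            name = name.replace(k, v)
        return name

    # "EVERGREEN PROPERTIES, L.L.C." -> "EVERGREEN PROPERTIES LLC", so the pair counts as an exact match.
    assert normalize("EVERGREEN PROPERTIES, L.L.C.") == normalize("EVERGREEN PROPERTIES LLC")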