KSA: Initial commit
This commit is contained in:
parent
681dfbd88a
commit
a8dab495fe
|
@ -0,0 +1,3 @@
|
||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
|
@ -0,0 +1 @@
|
||||||
|
parser_OLX
|
|
@ -0,0 +1,6 @@
|
||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
|
@ -0,0 +1,4 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
|
@ -0,0 +1,8 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/parser_OLX.iml" filepath="$PROJECT_DIR$/.idea/parser_OLX.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
|
@ -0,0 +1,10 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
|
@ -0,0 +1,6 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,89 @@
|
||||||
|
import sqlite3
|
||||||
|
from sqlite3 import Error
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Route log records to logfile.log (append mode) through the root logger.
# basicConfig does nothing when the root logger already has handlers, so the
# first module to run this call decides the actual configuration.
# The previous bare logging.FileHandler('logfile.log') call was dropped: it
# opened the file but the handler was never attached to any logger or closed.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||||
|
|
||||||
|
|
||||||
|
class DbConnect:
    """Small wrapper around one SQLite connection.

    The connection is not opened by the constructor; call get_connection()
    before running any query. Failures are logged through the module logger
    instead of being raised to the caller.
    """

    # Cities that have no olx_data row dated today (SQLite date('now')).
    _NOT_READY_SQL = """SELECT
                    name, city_id FROM cities
                    WHERE city_id NOT IN
                    (SELECT city_id FROM olx_data
                    WHERE date=date('now'));
                    """

    def __init__(self, db_path):
        """Remember the database path; no connection is opened yet.

        Args:
            db_path: path handed to sqlite3.connect() by get_connection().
        """
        self.connection = None
        self.db_path = db_path

    def get_connection(self):
        """Open the SQLite connection and store it on self.connection.

        A sqlite3.Error is logged at CRITICAL level and not re-raised; in
        that case self.connection keeps its previous value.

        Returns:
            The current value of self.connection (None if never connected).
        """
        try:
            self.connection = sqlite3.connect(self.db_path)
            logger.info("database connected successfully")
        except Error as e:
            logger.critical(f"error: {e}")
        return self.connection

    def run_query(self, sql_query, row=()):
        """Execute one parameterized statement and commit it.

        Args:
            sql_query: SQL text, using ? placeholders for values.
            row: sequence of values bound to the placeholders.

        Returns:
            True when the statement was executed and committed, False when
            sqlite3 reported an error (the error is logged, the transaction
            is rolled back, and nothing is raised).
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute(sql_query, row)
            self.connection.commit()
            logger.debug(sql_query)
            logger.debug(row)
            logger.info("SQL query run successfully")
            return True
        except Error as e:
            # Leave no half-applied transaction behind and report the failure
            # through the log file like every other event in this class.
            self.connection.rollback()
            logger.error(f"Query Failed: {e}")
            return False
        finally:
            cursor.close()

    def get_data(self, query):
        """Run a query without parameters and return all result rows.

        Args:
            query: SQL text to execute.

        Returns:
            A list of row tuples, as produced by cursor.fetchall().

        Raises:
            sqlite3.Error: propagated unchanged if the query fails.
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        finally:
            cursor.close()

    def get_notready(self):
        """Return (name, city_id) rows for cities with no olx_data row today."""
        return self.get_data(self._NOT_READY_SQL)
|
||||||
|
|
||||||
|
|
||||||
|
# con = DbConnect("/home/krzychu/PycharmProjects/OLX_selenium/test_table.db")
|
||||||
|
# con.get_connection()
|
||||||
|
# regions = con.get_column("regions", "name, region_id")
|
||||||
|
# cities = con.get_column("cities", "city_id, name, city_region_id")
|
||||||
|
|
||||||
|
# cities_div = {}
|
||||||
|
#
|
||||||
|
# for el in regions:
|
||||||
|
# # print(el)
|
||||||
|
# cities_list = []
|
||||||
|
# for i in cities:
|
||||||
|
# if i[2] == el[1]:
|
||||||
|
# cities_list.append(i[1])
|
||||||
|
# cities_div[el[0]] = cities_list
|
||||||
|
#
|
||||||
|
# print(cities_div)
|
||||||
|
|
||||||
|
# print(regions)
|
||||||
|
# print(cities)
|
||||||
|
|
||||||
|
# add_record = """
|
||||||
|
# INSERT INTO
|
||||||
|
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
|
||||||
|
# VALUES
|
||||||
|
# (4548, 629, 837, 7);
|
||||||
|
# """
|
||||||
|
|
||||||
|
# run_query(connection=con, sql_query=add_record)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,84 @@
|
||||||
|
#!/home/krzychu/PycharmProjects/parser_OLX/venv/bin/python
|
||||||
|
|
||||||
|
from db_con import DbConnect
|
||||||
|
from sel_source import SelRequest
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import csv
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
|
|
||||||
|
# Translation table from Polish diacritic letters to their plain ASCII
# counterparts; built once at import time instead of on every call.
_POLISH_TO_ASCII = str.maketrans('ĄĆĘÓŁŚŻŹŃąćęółśżźń', 'ACEOLSZZNaceolszzn')


def removeaccents(input_text):
    """Return input_text with Polish diacritic letters replaced by ASCII.

    Characters outside the translation table are left unchanged.

    >>> removeaccents('Łódź')
    'Lodz'
    """
    return input_text.translate(_POLISH_TO_ASCII)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Route log records to logfile.log (append mode) through the root logger.
# basicConfig does nothing when the root logger already has handlers, so the
# first module to run this call decides the actual configuration.
# The previous bare logging.FileHandler('logfile.log') call was dropped: it
# opened the file but the handler was never attached to any logger or closed.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||||
|
|
||||||
|
|
||||||
|
# Open the scraper database (path is relative to the working directory).
con = DbConnect("olx_data.db")
con.get_connection()

# Cities that do not yet have an olx_data row dated today. The LEFT JOIN only
# filters anything when combined with the IS NULL test; without it every city
# was returned (and repeated once per row already stored today).
# NOTE(review): date('now') is SQLite's UTC date while the rows below are
# stamped with the local date.today(); the two can differ around midnight.
cities = con.get_data("""select c.city_id, c.name
                    from cities c left join
                    (select city_id from olx_data where date = date('now')) o
                    on o.city_id=c.city_id
                    where o.city_id is null;""")

url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
# One (listing URL, city_id) pair per city; the URL slug is the city name
# without diacritics, lower-cased, with spaces turned into dashes.
cities_list = [(url_base + removeaccents(name).lower().replace(" ", "-") + '/', city_id)
               for city_id, name in cities]

# NOTE(review): only the header row is ever written to urls.csv; the scraped
# rows go to the database only. Confirm whether the CSV is still needed.
with open('urls.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    header = ['rent', 'sale', 'exchange', 'city_id']
    writer.writerow(header)

# Parameterized insert; the value order matches the list built in the loop:
# [rent, sale, exchange] from get_olx_stats(), then city_id, then the date.
record = """
            INSERT INTO
              olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
            VALUES
              (?, ?, ?, ?, ?); """

for url, city_id in cities_list:
    start = time.time()
    curTime = date.today()
    row = SelRequest(url).get_olx_stats()
    row.append(city_id)
    # Bind the date as explicit ISO text (YYYY-MM-DD) rather than relying on
    # sqlite3's implicit date adapter; the stored value is the same string.
    row.append(curTime.isoformat())
    con.run_query(record, row)
    end = time.time()
    logger.info('executed in %s', end - start)
|
||||||
|
|
||||||
|
|
||||||
|
# with open('first.csv', 'rb') as inp, open('first_edit.csv', 'wb') as out:
|
||||||
|
# writer = csv.writer(out)
|
||||||
|
# for row in csv.reader(inp):
|
||||||
|
# if row[2] != "0":
|
||||||
|
# writer.writerow(row)
|
||||||
|
|
||||||
|
# add_record = """
|
||||||
|
# INSERT INTO
|
||||||
|
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
|
||||||
|
# VALUES
|
||||||
|
# (4548, 629, 837, 7);
|
||||||
|
# """
|
||||||
|
|
||||||
|
|
||||||
|
# cities_div_urls = {}
|
||||||
|
# url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
|
||||||
|
#
|
||||||
|
# for el in regions:
|
||||||
|
# cities_list = [(url_base+removeaccents(i[1]).lower().replace(" ", "-")+'/', i[0]) for i in cities if i[2] == el[1]]
|
||||||
|
# cities_div_urls[el[0]] = cities_list
|
||||||
|
#
|
||||||
|
# reg_urls = [(url_base+removeaccents(el[0]).lower()+'/', el[1]) for el in regions]
|
||||||
|
|
||||||
|
# select c.city_id, o.city_id from cities c left join
|
||||||
|
# (select city_id from olx_data where date = date('now')) o on o.city_id=c.city_id;
|
||||||
|
|
||||||
|
# select name, city_id from cities where city_id not in
|
||||||
|
# (select city_id from olx_data where date=date('now'));
|
||||||
|
|
||||||
|
# select name from olx_data join cities on olx_data.city_id = cities.city_id where date = date('now');
|
Binary file not shown.
|
@ -0,0 +1,28 @@
|
||||||
|
async-generator==1.10
|
||||||
|
attrs==21.4.0
|
||||||
|
beautifulsoup4==4.11.1
|
||||||
|
bs4==0.0.1
|
||||||
|
certifi==2022.6.15
|
||||||
|
cffi==1.15.1
|
||||||
|
charset-normalizer==2.1.0
|
||||||
|
cryptography==37.0.4
|
||||||
|
h11==0.13.0
|
||||||
|
idna==3.3
|
||||||
|
lxml==4.9.1
|
||||||
|
outcome==1.2.0
|
||||||
|
pybrowsers==0.5.0
|
||||||
|
pycparser==2.21
|
||||||
|
pyOpenSSL==22.0.0
|
||||||
|
PySocks==1.7.1
|
||||||
|
python-dotenv==0.20.0
|
||||||
|
pyxdg==0.28
|
||||||
|
requests==2.28.1
|
||||||
|
selenium==4.3.0
|
||||||
|
sniffio==1.2.0
|
||||||
|
sortedcontainers==2.4.0
|
||||||
|
soupsieve==2.3.2.post1
|
||||||
|
trio==0.21.0
|
||||||
|
trio-websocket==0.9.2
|
||||||
|
urllib3==1.26.10
|
||||||
|
webdriver-manager==3.8.0
|
||||||
|
wsproto==1.1.0
|
|
@ -0,0 +1,66 @@
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Route log records to logfile.log (append mode) through the root logger.
# basicConfig does nothing when the root logger already has handlers, so the
# first module to run this call decides the actual configuration.
# The previous bare logging.FileHandler('logfile.log') call was dropped: it
# opened the file but the handler was never attached to any logger or closed.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||||
|
|
||||||
|
|
||||||
|
class SelRequest:
    """Load one listing page in headless Chrome and read its advert counts.

    A Chrome driver is started by the constructor and shut down by
    get_source(), so an instance is meant to fetch its URL once.
    """

    def __init__(self, url):
        """Start a headless Chrome driver for the given URL.

        Args:
            url: address of the page that get_source() will load.
        """
        self.url = url
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=self.options)
        self.source = ''

    def get_source(self):
        """Load self.url and return the page source.

        The driver is always shut down with quit(), also when loading fails,
        so no browser or driver process is left behind per request. Errors
        raised by the driver propagate to the caller.

        Returns:
            The page source as reported by the driver (also kept in
            self.source).
        """
        try:
            self.driver.get(self.url)
            logger.info(f'getting URL = {self.url}')
            self.source = self.driver.page_source
        finally:
            self.driver.quit()
        return self.source

    def get_olx_stats(self):
        """Fetch the page and extract the advert counts per category.

        Returns:
            A list of three ints in the order of the labels "Wynajem",
            "Sprzedaż", "Zamiana"; a label that is not found on the page
            contributes 0.

        Raises:
            ValueError: if the text extracted for a label is not a number.
        """
        soup = BeautifulSoup(self.get_source(), "html.parser")
        # Work on the raw markup of each matching anchor, rendered once.
        anchors = [str(el) for el in soup.find_all('a', {'class': 'css-pyvavn'})]
        lookfor = ["Wynajem", "Sprzedaż", "Zamiana"]
        stat = []
        for label in lookfor:
            count_text = '0'
            for html in anchors:
                pos = html.find(label)
                if pos > 0:
                    # NOTE(review): the +20 offset presumably skips past the
                    # label and into the tag that wraps the number; confirm
                    # against the current page markup. The text between the
                    # next '>' and the next '<' after that offset is taken.
                    # When several anchors match, the last one wins.
                    offset = pos + 20
                    count_text = html[html.find('>', offset) + 1:html.find('<', offset)]
            # Numbers may contain non-breaking spaces (U+00A0) between digit
            # groups; strip them before converting.
            stat.append(int(count_text.replace('\xa0', '')))
        logger.info(f'found data: {stat}')
        return stat
|
||||||
|
|
||||||
|
|
||||||
|
# link = 'https://www.olx.pl/d/nieruchomosci/mieszkania/lubin/'
|
||||||
|
#
|
||||||
|
# options = Options()
|
||||||
|
# options.add_argument('--headless')
|
||||||
|
# options.add_argument('--no-sandbox')
|
||||||
|
# options.add_argument('--disable-dev-shm-usage')
|
||||||
|
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||||
|
#
|
||||||
|
# driver.get(link)
|
||||||
|
# innerHTML = driver.execute_script("return document.body.innerHTML")
|
||||||
|
# sc =
|
||||||
|
# driver.close()
|
||||||
|
#
|
||||||
|
# soup = BeautifulSoup(sc, "html.parser")
|
||||||
|
# # print(soup.prettify())
|
||||||
|
# span = soup.find_all('a', {'class': 'css-pyvavn'})
|
||||||
|
#
|
||||||
|
# print(span)
|
Loading…
Reference in New Issue