KSA: Initial commit
This commit is contained in:
parent
681dfbd88a
commit
a8dab495fe
|
@ -0,0 +1,3 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
|
@ -0,0 +1 @@
|
|||
parser_OLX
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/parser_OLX.iml" filepath="$PROJECT_DIR$/.idea/parser_OLX.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,10 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,89 @@
|
|||
import sqlite3
|
||||
from sqlite3 import Error
|
||||
import logging
|
||||
|
||||
|
||||
# Module-level logger; records go through the root configuration set up below.
logger = logging.getLogger(__name__)
# basicConfig attaches its own file handler for 'logfile.log'. The stand-alone
# FileHandler that used to be created here was never attached to any logger and
# only leaked an open file handle, so it has been dropped.
# basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||
|
||||
|
||||
class DbConnect:
    """Thin wrapper around a single ``sqlite3`` connection.

    ``get_connection`` must be called before any query method, because they
    all operate on ``self.connection``.
    """

    def __init__(self, db_path):
        """Remember ``db_path``; no connection is opened yet."""
        self.connection = None
        self.db_path = db_path

    def get_connection(self):
        """Open the database at ``self.db_path`` and store the connection.

        A ``sqlite3.Error`` is logged at CRITICAL level and not re-raised; in
        that case ``self.connection`` keeps whatever value it had before.
        """
        try:
            self.connection = sqlite3.connect(self.db_path)
        except Error as e:
            logger.critical(f"error: {e}")
        else:
            logger.info("database connected successfully")

    def run_query(self, sql_query, row):
        """Execute one parameterised statement and commit it.

        Args:
            sql_query: SQL text containing parameter placeholders.
            row: Values bound to those placeholders.

        A ``sqlite3.Error`` is logged and swallowed so the caller can carry on
        with its next statement; the failed transaction is rolled back.
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute(sql_query, row)
            self.connection.commit()
        except Error as e:
            # Undo any partial work and report through the log like the rest
            # of the class does, instead of printing to stdout.
            self.connection.rollback()
            logger.error(f"Query failed: {e}")
        else:
            logger.debug(sql_query)
            logger.debug(row)
            logger.info("SQL query run successfully")
        finally:
            cursor.close()

    def get_data(self, query):
        """Run ``query`` and return every result row as a list of tuples."""
        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        finally:
            cursor.close()

    def get_notready(self):
        """Return ``(name, city_id)`` for cities with no ``olx_data`` row dated today.

        "Today" is SQLite's ``date('now')``.
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute("""SELECT
                name, city_id FROM cities
                WHERE city_id NOT IN
                (SELECT city_id FROM olx_data
                WHERE date=date('now'));
                """)
            return cursor.fetchall()
        finally:
            cursor.close()
|
||||
|
||||
|
||||
# con = DbConnect("/home/krzychu/PycharmProjects/OLX_selenium/test_table.db")
|
||||
# con.get_connection()
|
||||
# regions = con.get_column("regions", "name, region_id")
|
||||
# cities = con.get_column("cities", "city_id, name, city_region_id")
|
||||
|
||||
# cities_div = {}
|
||||
#
|
||||
# for el in regions:
|
||||
# # print(el)
|
||||
# cities_list = []
|
||||
# for i in cities:
|
||||
# if i[2] == el[1]:
|
||||
# cities_list.append(i[1])
|
||||
# cities_div[el[0]] = cities_list
|
||||
#
|
||||
# print(cities_div)
|
||||
|
||||
# print(regions)
|
||||
# print(cities)
|
||||
|
||||
# add_record = """
|
||||
# INSERT INTO
|
||||
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
|
||||
# VALUES
|
||||
# (4548, 629, 837, 7);
|
||||
# """
|
||||
|
||||
# run_query(connection=con, sql_query=add_record)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,84 @@
|
|||
#!/home/krzychu/PycharmProjects/parser_OLX/venv/bin/python
|
||||
|
||||
from db_con import DbConnect
|
||||
from sel_source import SelRequest
|
||||
import logging
|
||||
import time
|
||||
import csv
|
||||
from datetime import date
|
||||
|
||||
|
||||
def removeaccents(input_text):
    """Return ``input_text`` with Polish diacritic letters replaced by ASCII ones.

    Characters that are not in the mapping are passed through unchanged.
    """
    accented = 'ĄĆĘÓŁŚŻŹŃąćęółśżźń'
    plain = 'ACEOLSZZNaceolszzn'
    # Code point -> replacement letter, the mapping shape str.translate expects.
    table = {ord(src): dst for src, dst in zip(accented, plain)}
    return input_text.translate(table)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
# basicConfig attaches its own file handler; the unattached FileHandler that
# used to be created here only leaked an open file handle and was removed.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')

con = DbConnect("olx_data.db")
con.get_connection()

# Cities that have no olx_data row for today yet. Without the IS NULL filter
# the left join returns every city (and repeats a city once per existing row),
# so a re-run on the same day would scrape and insert everything again.
# NOTE(review): SQLite's date('now') is UTC while the inserted date below comes
# from the local clock - the two can disagree around midnight; confirm which
# one is intended.
cities = con.get_data("""select c.city_id, c.name
    from cities c left join
    (select city_id from olx_data where date = date('now')) o
    on o.city_id=c.city_id
    where o.city_id is null;""")

url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
# (listing URL, city_id) pairs; the URL slug is the lower-cased, de-accented
# city name with spaces turned into hyphens.
cities_list = [(url_base + removeaccents(name).lower().replace(" ", "-") + '/', city_id)
               for city_id, name in cities]

# Only the header row is ever written to urls.csv, so the file does not need
# to stay open while the scrape runs.
with open('urls.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    header = ['rent', 'sale', 'exchange', 'city_id']
    writer.writerow(header)

# The statement is the same for every city; build it once.
record = """
    INSERT INTO
    olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
    VALUES
    (?, ?, ?, ?, ?); """

for url, city_id in cities_list:
    start = time.time()
    # get_olx_stats returns the three counts in rent, sale, exchange order,
    # matching the first three columns of the INSERT.
    row = SelRequest(url).get_olx_stats()
    row.append(city_id)
    # ISO text is what sqlite3's default date adapter would store as well.
    row.append(date.today().isoformat())
    con.run_query(record, row)
    end = time.time()
    logger.info(f'executed in {end-start}')
|
||||
|
||||
|
||||
# with open('first.csv', 'rb') as inp, open('first_edit.csv', 'wb') as out:
|
||||
# writer = csv.writer(out)
|
||||
# for row in csv.reader(inp):
|
||||
# if row[2] != "0":
|
||||
# writer.writerow(row)
|
||||
|
||||
# add_record = """
|
||||
# INSERT INTO
|
||||
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
|
||||
# VALUES
|
||||
# (4548, 629, 837, 7);
|
||||
# """
|
||||
|
||||
|
||||
# cities_div_urls = {}
|
||||
# url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
|
||||
#
|
||||
# for el in regions:
|
||||
# cities_list = [(url_base+removeaccents(i[1]).lower().replace(" ", "-")+'/', i[0]) for i in cities if i[2] == el[1]]
|
||||
# cities_div_urls[el[0]] = cities_list
|
||||
#
|
||||
# reg_urls = [(url_base+removeaccents(el[0]).lower()+'/', el[1]) for el in regions]
|
||||
|
||||
# select c.city_id, o.city_id from cities c left join
|
||||
# (select city_id from olx_data where date = date('now')) o on o.city_id=c.city_id;
|
||||
|
||||
# select name, city_id from cities where city_id not in
|
||||
# (select city_id from olx_data where date=date('now'));
|
||||
|
||||
# select name from olx_data join cities on olx_data.city_id = cities.city_id where date = date('now');
|
Binary file not shown.
|
@ -0,0 +1,28 @@
|
|||
async-generator==1.10
|
||||
attrs==21.4.0
|
||||
beautifulsoup4==4.11.1
|
||||
bs4==0.0.1
|
||||
certifi==2022.6.15
|
||||
cffi==1.15.1
|
||||
charset-normalizer==2.1.0
|
||||
cryptography==37.0.4
|
||||
h11==0.13.0
|
||||
idna==3.3
|
||||
lxml==4.9.1
|
||||
outcome==1.2.0
|
||||
pybrowsers==0.5.0
|
||||
pycparser==2.21
|
||||
pyOpenSSL==22.0.0
|
||||
PySocks==1.7.1
|
||||
python-dotenv==0.20.0
|
||||
pyxdg==0.28
|
||||
requests==2.28.1
|
||||
selenium==4.3.0
|
||||
sniffio==1.2.0
|
||||
sortedcontainers==2.4.0
|
||||
soupsieve==2.3.2.post1
|
||||
trio==0.21.0
|
||||
trio-websocket==0.9.2
|
||||
urllib3==1.26.10
|
||||
webdriver-manager==3.8.0
|
||||
wsproto==1.1.0
|
|
@ -0,0 +1,66 @@
|
|||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
|
||||
# Module-level logger; records go through the root configuration set up below.
logger = logging.getLogger(__name__)
# basicConfig attaches its own file handler for 'logfile.log'. The stand-alone
# FileHandler that used to be created here was never attached to any logger and
# only leaked an open file handle, so it has been dropped.
# basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(filename='logfile.log', filemode='a', level=logging.INFO,
                    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||
|
||||
|
||||
class SelRequest:
    """Load one page in headless Chrome and read advert counts from it.

    A browser is started in ``__init__`` and shut down at the end of
    ``get_source``, so each instance is good for a single fetch.
    """

    def __init__(self, url):
        """Start a headless Chrome driver that will later load ``url``."""
        self.url = url
        self.options = Options()
        for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
            self.options.add_argument(flag)
        self.driver = webdriver.Chrome(options=self.options)
        self.source = ''

    def get_source(self):
        """Load ``self.url`` and return the page source as a string.

        The browser is always shut down afterwards, even when loading fails.
        """
        logger.info(f'getting URL = {self.url}')
        try:
            self.driver.get(self.url)
            # Return value is unused; call retained from the original code.
            # NOTE(review): possibly meant to make sure the body is rendered
            # before reading page_source - confirm whether it is still needed.
            self.driver.execute_script("return document.body.innerHTML")
            self.source = self.driver.page_source
        finally:
            # quit() ends the whole driver session; close() would only close
            # the window and leave the chromedriver process running.
            self.driver.quit()
        return self.source

    @staticmethod
    def _extract_count(markup, label):
        """Return the raw count text that follows ``label`` in ``markup``.

        Returns ``None`` when ``label`` is missing or sits at index 0. The
        text is taken between the first '>' and the first '<' found 20 or more
        characters after the label, i.e. the content of the element that
        follows the label text.
        """
        pos = markup.find(label)
        if pos <= 0:
            return None
        anchor = pos + 20
        return markup[markup.find('>', anchor) + 1:markup.find('<', anchor)]

    @staticmethod
    def _to_int(text):
        """Convert count text to ``int``, dropping non-breaking spaces first."""
        return int(text.replace(u'\xa0', u''))

    def get_olx_stats(self):
        """Return ``[rent, sale, exchange]`` advert counts for the page.

        A category that is not found among the matching links counts as 0.

        Raises:
            ValueError: if the extracted text is not a number.
        """
        soup = BeautifulSoup(self.get_source(), "html.parser")
        # Render each matching link to text once instead of on every lookup.
        links = [str(tag) for tag in soup.find_all('a', {'class': 'css-pyvavn'})]
        stat = []
        for label in ("Wynajem", "Sprzedaż", "Zamiana"):
            raw = '0'
            for markup in links:
                found = self._extract_count(markup, label)
                if found is not None:
                    # No break: the last matching link wins, as before.
                    raw = found
            stat.append(self._to_int(raw))
        logger.info(f'found data: {stat}')
        return stat
|
||||
|
||||
|
||||
# link = 'https://www.olx.pl/d/nieruchomosci/mieszkania/lubin/'
|
||||
#
|
||||
# options = Options()
|
||||
# options.add_argument('--headless')
|
||||
# options.add_argument('--no-sandbox')
|
||||
# options.add_argument('--disable-dev-shm-usage')
|
||||
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||
#
|
||||
# driver.get(link)
|
||||
# innerHTML = driver.execute_script("return document.body.innerHTML")
|
||||
# sc =
|
||||
# driver.close()
|
||||
#
|
||||
# soup = BeautifulSoup(sc, "html.parser")
|
||||
# # print(soup.prettify())
|
||||
# span = soup.find_all('a', {'class': 'css-pyvavn'})
|
||||
#
|
||||
# print(span)
|
Loading…
Reference in New Issue