KSA: Initial commit

This commit is contained in:
Krzychu 2022-07-11 18:36:52 +02:00
parent 681dfbd88a
commit a8dab495fe
16 changed files with 2139 additions and 0 deletions

3
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

1
.idea/.name Normal file
View File

@ -0,0 +1 @@
parser_OLX

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/parser_OLX.iml" filepath="$PROJECT_DIR$/.idea/parser_OLX.iml" />
</modules>
</component>
</project>

10
.idea/parser_OLX.iml Normal file
View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

Binary file not shown.

Binary file not shown.

89
db_con.py Normal file
View File

@ -0,0 +1,89 @@
import sqlite3
from sqlite3 import Error
import logging
# Module-level logger; its records reach logfile.log through the root logger
# configured below.
logger = logging.getLogger(__name__)

# basicConfig creates and attaches the file handler itself, so no separate
# FileHandler is instantiated here: an unattached handler never receives a
# record and would only keep a second handle on the log file open.
# Note that basicConfig does nothing when the root logger already has handlers.
logging.basicConfig(
    filename='logfile.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s',
)
class DbConnect:
    """Small wrapper around a single SQLite connection.

    Construct it with the database path, call get_connection() once, then
    use run_query() for writes and get_data() / get_notready() for reads.
    """

    # Cities that have no olx_data row whose date equals SQLite's date('now').
    # NOTE(review): SQLite evaluates date('now') in UTC; confirm that this
    # matches how the `date` column is written by the callers.
    _NOT_READY_QUERY = """SELECT
name, city_id FROM cities
WHERE city_id NOT IN
(SELECT city_id FROM olx_data
WHERE date=date('now'));
"""

    def __init__(self, db_path):
        """Store *db_path*; the connection is opened later by get_connection()."""
        self.connection = None
        self.db_path = db_path

    def get_connection(self):
        """Open the database at ``self.db_path`` and keep it on ``self.connection``.

        A failure to connect is logged at CRITICAL level and not re-raised;
        ``self.connection`` then keeps whatever value it had before the call.

        Returns:
            The value of ``self.connection`` after the attempt.
        """
        try:
            self.connection = sqlite3.connect(self.db_path)
        except Error as e:
            logger.critical("error: %s", e)
        else:
            logger.info("database connected successfully")
        return self.connection

    def run_query(self, sql_query, row):
        """Execute one parameterised statement and commit it.

        Args:
            sql_query: SQL text containing placeholders for the values.
            row: Sequence of values bound to the placeholders.

        A statement that fails with an sqlite3 error is rolled back and the
        error is logged; it is not propagated to the caller.
        """
        cursor = self.connection.cursor()
        try:
            cursor.execute(sql_query, row)
            self.connection.commit()
        except Error as e:
            # Drop whatever the failed statement left pending so that it
            # cannot be committed together with a later statement.
            self.connection.rollback()
            logger.error("Query failed: %s", e)
        else:
            logger.debug(sql_query)
            logger.debug(row)
            logger.info("SQL query run successfully")
        finally:
            cursor.close()

    def get_data(self, query):
        """Run *query* and return all of its result rows as a list."""
        cursor = self.connection.cursor()
        try:
            cursor.execute(query)
            # fetchall() already returns a list; no copy is needed.
            return cursor.fetchall()
        finally:
            cursor.close()

    def get_notready(self):
        """Return ``(name, city_id)`` rows for cities without an olx_data row for today."""
        return self.get_data(self._NOT_READY_QUERY)
# con = DbConnect("/home/krzychu/PycharmProjects/OLX_selenium/test_table.db")
# con.get_connection()
# regions = con.get_column("regions", "name, region_id")
# cities = con.get_column("cities", "city_id, name, city_region_id")
# cities_div = {}
#
# for el in regions:
# # print(el)
# cities_list = []
# for i in cities:
# if i[2] == el[1]:
# cities_list.append(i[1])
# cities_div[el[0]] = cities_list
#
# print(cities_div)
# print(regions)
# print(cities)
# add_record = """
# INSERT INTO
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
# VALUES
# (4548, 629, 837, 7);
# """
# run_query(connection=con, sql_query=add_record)

1833
logfile.log Normal file

File diff suppressed because it is too large Load Diff

84
main.py Normal file
View File

@ -0,0 +1,84 @@
#!/home/krzychu/PycharmProjects/parser_OLX/venv/bin/python
from db_con import DbConnect
from sel_source import SelRequest
import logging
import time
import csv
from datetime import date
# Built once at import time instead of on every call: maps each Polish
# accented letter (upper and lower case) to its unaccented ASCII letter.
_ACCENT_TABLE = str.maketrans('ĄĆĘÓŁŚŻŹŃąćęółśżźń', 'ACEOLSZZNaceolszzn')


def removeaccents(input_text: str) -> str:
    """Return *input_text* with Polish diacritics replaced by ASCII letters.

    Characters outside the nine Polish accented letters (in either case)
    are returned unchanged.
    """
    return input_text.translate(_ACCENT_TABLE)
# Script-level logger; records go to logfile.log through the root logger.
logger = logging.getLogger(__name__)
# basicConfig attaches the file handler itself (and is a no-op when the root
# logger is already configured), so no separate, unattached FileHandler is
# created here.
logging.basicConfig(
    filename='logfile.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s',
)

con = DbConnect("olx_data.db")
con.get_connection()
if con.connection is None:
    # get_connection() only logs a failed connect; stop here with a clear
    # message rather than failing later on a missing connection.
    raise SystemExit("could not open olx_data.db; see logfile.log")

# Cities that do not yet have an olx_data row for today.
# * The `o.city_id is null` filter turns the left join into an anti-join;
#   without it the join excluded nothing and repeated a city once per row
#   already stored for the day.
# * 'localtime' makes SQLite's date match the local date.today() value that
#   is written below (plain date('now') is evaluated in UTC).
cities = con.get_data("""select c.city_id, c.name
from cities c left join
(select city_id from olx_data where date = date('now', 'localtime')) o
on o.city_id=c.city_id
where o.city_id is null;""")

url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
# (listing URL, city_id) pairs; the URL slug is the unaccented, lower-cased
# city name with spaces replaced by hyphens.
cities_list = [
    (url_base + removeaccents(name).lower().replace(" ", "-") + '/', city_id)
    for city_id, name in cities
]

# Only the header row is written to urls.csv; the scraped rows go to the
# database below.
with open('urls.csv', 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    header = ['rent', 'sale', 'exchange', 'city_id']
    writer.writerow(header)

record = """
INSERT INTO
olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
VALUES
(?, ?, ?, ?, ?); """

for url, city_id in cities_list:
    start = time.perf_counter()
    # get_olx_stats() returns [rent, sale, exchange]; the city id and the
    # date are appended to complete the five placeholders of `record`.
    row = SelRequest(url).get_olx_stats()
    row.append(city_id)
    # Bind the date as ISO text explicitly instead of relying on sqlite3's
    # default date adapter (deprecated since Python 3.12); the stored value
    # is the same 'YYYY-MM-DD' string.
    row.append(date.today().isoformat())
    con.run_query(record, row)
    end = time.perf_counter()
    logger.info(f'executed in {end-start}')

con.connection.close()
# with open('first.csv', 'rb') as inp, open('first_edit.csv', 'wb') as out:
# writer = csv.writer(out)
# for row in csv.reader(inp):
# if row[2] != "0":
# writer.writerow(row)
# add_record = """
# INSERT INTO
# olx_data(city_id, adv_sale_count, adv_rent_count, adv_exchange_count)
# VALUES
# (4548, 629, 837, 7);
# """
# cities_div_urls = {}
# url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
#
# for el in regions:
# cities_list = [(url_base+removeaccents(i[1]).lower().replace(" ", "-")+'/', i[0]) for i in cities if i[2] == el[1]]
# cities_div_urls[el[0]] = cities_list
#
# reg_urls = [(url_base+removeaccents(el[0]).lower()+'/', el[1]) for el in regions]
# select c.city_id, o.city_id from cities c left join
# (select city_id from olx_data where date = date('now')) o on o.city_id=c.city_id;
# select name, city_id from cities where city_id not in
# (select city_id from olx_data where date=date('now'));
# select name from olx_data join cities on olx_data.city_id = cities.city_id where date = date('now');

BIN
olx_data.db Normal file

Binary file not shown.

28
requirements.txt Normal file
View File

@ -0,0 +1,28 @@
async-generator==1.10
attrs==21.4.0
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15
cffi==1.15.1
charset-normalizer==2.1.0
cryptography==37.0.4
h11==0.13.0
idna==3.3
lxml==4.9.1
outcome==1.2.0
pybrowsers==0.5.0
pycparser==2.21
pyOpenSSL==22.0.0
PySocks==1.7.1
python-dotenv==0.20.0
pyxdg==0.28
requests==2.28.1
selenium==4.3.0
sniffio==1.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
trio==0.21.0
trio-websocket==0.9.2
urllib3==1.26.10
webdriver-manager==3.8.0
wsproto==1.1.0

66
sel_source.py Normal file
View File

@ -0,0 +1,66 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import logging
# Module-level logger; its records reach logfile.log through the root logger
# configured below.
logger = logging.getLogger(__name__)

# basicConfig creates and attaches the file handler itself, so no separate
# FileHandler is instantiated here: an unattached handler never receives a
# record and would only keep a second handle on the log file open.
# Note that basicConfig does nothing when the root logger already has handlers.
logging.basicConfig(
    filename='logfile.log',
    filemode='a',
    level=logging.INFO,
    format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s',
)
class SelRequest:
    """Load one page in headless Chrome and read the OLX category counters.

    Each instance starts its own Chrome driver in __init__ and shuts it down
    at the end of get_source(), so an instance is good for a single fetch.
    """

    # Number of characters skipped after the start of a label before looking
    # for the tag that wraps the counter text.
    # NOTE(review): presumably tuned to the OLX markup at the time of
    # writing; confirm it still lands inside the counter's opening tag.
    _COUNT_OFFSET = 20

    def __init__(self, url):
        """Start a headless Chrome driver that will fetch *url*."""
        self.url = url
        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=self.options)
        self.source = ''

    def get_source(self):
        """Fetch ``self.url`` and return the page source.

        The result is also stored on ``self.source``. The driver is always
        shut down afterwards, including when loading the page raises.
        """
        try:
            self.driver.get(self.url)
            logger.info(f'getting URL = {self.url}')
            # NOTE(review): the script's return value is unused; the call is
            # retained as-is in case the round trip matters for page timing.
            self.driver.execute_script("return document.body.innerHTML")
            self.source = self.driver.page_source
        finally:
            # quit() ends the whole driver session; close() would only close
            # the window and leave the chromedriver process running, which
            # accumulates when one instance is created per city.
            self.driver.quit()
        return self.source

    @staticmethod
    def _extract_count(markup, label):
        """Return the text following *label* in *markup*, or None if absent.

        The text is taken between the first '>' and the first '<' found from
        ``_COUNT_OFFSET`` characters after the start of the label.
        """
        label_at = markup.find(label)
        if label_at <= 0:
            return None
        scan_from = label_at + SelRequest._COUNT_OFFSET
        begin = markup.find('>', scan_from) + 1
        end = markup.find('<', scan_from)
        return markup[begin:end]

    def get_olx_stats(self):
        """Return the advert counters of the page as ``[rent, sale, exchange]``.

        The counters are read from the ``<a class="css-pyvavn">`` elements
        whose markup contains the labels "Wynajem" (rent), "Sprzedaż" (sale)
        and "Zamiana" (exchange). A label that is not found counts as 0; when
        several elements contain a label, the last one wins.

        Raises:
            ValueError: if the text extracted for a label is not a number.
        """
        soup = BeautifulSoup(self.get_source(), "html.parser")
        # Serialise each candidate element once instead of once per lookup.
        anchors = [str(el) for el in soup.find_all('a', {'class': 'css-pyvavn'})]
        stat = []
        for label in ("Wynajem", "Sprzedaż", "Zamiana"):
            text = '0'
            for markup in anchors:
                found = self._extract_count(markup, label)
                if found is not None:
                    text = found
            # Counters use a non-breaking space as the thousands separator.
            stat.append(int(text.replace('\xa0', '')))
        logger.info(f'found data: {stat}')
        return stat
# link = 'https://www.olx.pl/d/nieruchomosci/mieszkania/lubin/'
#
# options = Options()
# options.add_argument('--headless')
# options.add_argument('--no-sandbox')
# options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
#
# driver.get(link)
# innerHTML = driver.execute_script("return document.body.innerHTML")
# sc =
# driver.close()
#
# soup = BeautifulSoup(sc, "html.parser")
# # print(soup.prettify())
# span = soup.find_all('a', {'class': 'css-pyvavn'})
#
# print(span)

1
urls.csv Normal file
View File

@ -0,0 +1 @@
rent,sale,exchange,city_id
1 rent sale exchange city_id