Now it runs in a loop until all cities are done.

This commit is contained in:
Krzychu 2022-07-20 11:14:09 +02:00
parent bdd8ebe258
commit 9db83ae3b8
3 changed files with 41 additions and 36 deletions

View File

@ -1,7 +1,7 @@
import sqlite3
from sqlite3 import Error
import logging
import time
logger = logging.getLogger(__name__)
logging.FileHandler('logfile.log')
@ -28,11 +28,13 @@ class DbConnect:
def insert_data(self, sql_query, values): # insert
"""Execute a parameterized INSERT and commit it, logging the elapsed time.

NOTE(review): this span is a rendered diff hunk with the +/- markers and
indentation stripped — the two logger.info lines below are the OLD line
(removed) and its timed replacement (added); only the second one exists
in the actual file. Leading indentation was also lost in rendering.
"""
cursor = self.connection.cursor()
try:
# time the execute+commit pair so the duration can be reported below
start = time.time()
# parameterized execution: `values` are bound by sqlite3, not interpolated
cursor.execute(sql_query, values)
self.connection.commit()
end = time.time()
logger.debug(sql_query)
logger.debug(values)
logger.info("SQL query run successfully in ... s") # add time
logger.info(f"SQL query run successfully in {end-start} s")
except Error as e:
# failures are printed rather than logged/re-raised — caller is not notified
print(f" Query Failed……{e}")

65
main.py
View File

@ -12,41 +12,44 @@ logging.FileHandler('logfile.log')
# NOTE(review): this whole span is a rendered diff hunk of main.py with the
# +/- markers and indentation stripped. It contains BOTH the old one-pass
# version of the scrape loop (removed) and the new retry-until-done version
# (added) back to back; only the second exists in the actual file.
logging.basicConfig(filename='logfile.log', filemode='w', level=logging.INFO,
format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
# open the SQLite database used to persist the scraped OLX counts
con = DbConnect("sync/olx_data.db")
con.get_connection()
# cities not yet scraped today: LEFT JOIN against today's olx_data rows,
# keep only cities with no matching row (o.city_id IS NULL)
cities = con.get_data("""select c.city_id, c.name, c.is_duplicate
from cities_custom c left join
(select city_id from olx_data where date = date('now')) o
on o.city_id=c.city_id where o.city_id is null;""")
# double, cities_custom
url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
# build (listing_url, city_id) pairs: accents removed, spaces -> dashes,
# i[2] is presumably a disambiguation suffix for duplicate city names — TODO confirm
cities_list = [(url_base + con.remove_accents(i[1]).lower().replace(" ", "-") + i[2] + '/', i[0]) for i in cities]
drive = SelRequest()
# --- OLD (removed) single-pass version: one attempt per city, no retry ---
for i in cities_list:
start = time.time()
curTime = date.today()
print(i[0])
source = drive.get_source(i[0])
values = drive.get_olx_stats(source)
values.append(i[1])
values.append(curTime)
end = time.time()
# heuristic: a sub-2s fetch likely means the page did not really load
if end-start < 2:
print("Too quick, something is wrong with: ", i[0], '-- id:', i[1])
pass
query = """
INSERT INTO
olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
VALUES
(?, ?, ?, ?, ?); """
con.insert_data(query, values)
logger.info(f'loop executed in {end-start}')
# --- NEW (added) version: re-query unfinished cities and loop until none remain ---
cities_not_done = """select c.city_id, c.name, c.is_duplicate
from cities_custom c left join
(select city_id from olx_data where date = date('now')) o
on o.city_id=c.city_id where o.city_id is null;"""
cities = con.get_data(cities_not_done)
url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
cities_list = [(url_base + con.remove_accents(i[1]).lower().replace(" ", "-") + i[2] + '/', i[0]) for i in cities]
not_empty = len(cities_list)
# keep retrying until every city has a row for today's date
while not_empty > 0:
for i in cities_list:
start = time.time()
curTime = date.today()
# print(i[0])
try:
source = drive.get_source(i[0])
values = drive.get_olx_stats(source)
# append city_id and today's date to match the INSERT column order
values.append(i[1])
values.append(curTime)
query = """
INSERT INTO
olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
VALUES
(?, ?, ?, ?, ?); """
con.insert_data(query, values)
except Exception as e:
# best-effort: a failed city is skipped now and retried on the next
# while-iteration because it still matches cities_not_done
logger.info(f'an exception has occurred:{e}')
pass
end = time.time()
logger.info(f'loop executed in {end-start}')
# refresh the work list; loop ends when no unfinished cities remain
cities = con.get_data(cities_not_done)
cities_list = [(url_base + con.remove_accents(i[1]).lower().replace(" ", "-") + i[2] + '/', i[0]) for i in cities]
not_empty = len(cities_list)
# with open('first.csv', 'rb') as inp, open('first_edit.csv', 'wb') as out:

View File

@ -43,8 +43,8 @@ class SelRequest:
def get_olx_stats(self, source):
soup = BeautifulSoup(source, "html.parser")
span = soup.find_all('a', {'class': 'css-pyvavn'})
for i in span:
print(i.contents)
# for i in span:
# print(i.contents)
# this is how span looks like:
# [<a class="css-pyvavn" href="/d/nieruchomosci/mieszkania/wynajem/boleslawiec">Wynajem<span class="css-wz88">15</span></a>,
# <a class="css-pyvavn" href="/d/nieruchomosci/mieszkania/sprzedaz/boleslawiec">Sprzedaż<span class="css - wz88">35</span></a>,
@ -72,7 +72,7 @@ class SelRequest:
except ValueError:
stat.append(int(value.replace(u'\xa0', u'')))
logger.info(f'found data: {stat}')
print(stat)
# print(stat)
return stat