Optimised code to run faster.
This commit is contained in:
parent
1550d4f34e
commit
8bb3e62e7f
53
db_con.py
53
db_con.py
@ -18,45 +18,58 @@ class DbConnect:
|
||||
def get_connection(self):
    """Open the SQLite database at ``self.db_path`` and keep the handle.

    The handle is stored on ``self.connection``. A failure to connect is
    logged at CRITICAL level and is not re-raised.

    Returns:
        The connection stored on ``self.connection``; ``None`` when the
        connection attempt failed and no earlier handle exists.
    """
    try:
        self.connection = sqlite3.connect(self.db_path)
    except Error as e:
        logger.critical(f"error: {e}")
        # Guarantee the attribute exists so later code can test it
        # instead of failing with AttributeError.
        self.connection = getattr(self, "connection", None)
    else:
        # Report success only when connect() really succeeded.
        print("Database connected successfully.")
        logger.info("database connected successfully")
    return self.connection
|
||||
|
||||
def insert_data(self, sql_query, values):
    """Execute one parameterised write statement and commit it.

    Args:
        sql_query: SQL text using ``?`` placeholders.
        values: Sequence of parameters bound to the placeholders.

    A database error is logged and printed, the open transaction is
    rolled back, and the error is not re-raised.
    """
    import time  # local import: the file's import header is not visible here

    started = time.perf_counter()
    cursor = self.connection.cursor()
    try:
        cursor.execute(sql_query, values)
        self.connection.commit()
    except Error as e:
        # Undo the partial write so the next statement starts clean.
        self.connection.rollback()
        logger.error(f"query failed: {e}")
        print(f" Query Failed……{e}")
    else:
        elapsed = time.perf_counter() - started
        logger.debug(sql_query)
        logger.debug(values)
        logger.info(f"SQL query run successfully in {elapsed:.3f} s")
    finally:
        cursor.close()


# Earlier name of the same operation, kept so existing callers keep working.
run_query = insert_data
|
||||
|
||||
def remove_accents(self, input_text):
    """Return ``input_text`` with Polish diacritics replaced by ASCII letters.

    Args:
        input_text: Text that may contain the letters listed in
            ``accented`` below.

    Returns:
        str: A copy of the text in which each accented letter is swapped
        for the letter at the same position in ``plain``; every other
        character is left unchanged.
    """
    accented = 'ĄĆĘÓŁŚŻŹŃąćęółśżźń'
    plain = 'ACEOLSZZNaceolszzn'
    # Both strings have equal length: position i of `accented` maps to
    # position i of `plain`.
    return input_text.translate(str.maketrans(accented, plain))
|
||||
|
||||
def get_data(self, query):
    """Run a read query on ``self.connection`` and return all rows.

    Args:
        query: Complete SQL text to execute (no parameters are bound).

    Returns:
        list: Every row produced by the query, in the order the cursor
        yields them; an empty list when there are none.

    Errors raised while executing the query are not handled here and
    propagate to the caller.
    """
    cursor = self.connection.cursor()
    try:
        cursor.execute(query)
        # fetchall() already returns a list, so no copy loop is needed.
        return cursor.fetchall()
    finally:
        cursor.close()
|
||||
|
||||
def get_notready(self):
    """Return the cities that have no ``olx_data`` row dated today.

    Returns:
        list: ``(name, city_id)`` rows from ``cities`` whose ``city_id``
        does not appear in ``olx_data`` with ``date = date('now')``.
    """
    # NOTE(review): SQLite evaluates date('now') in UTC - confirm that this
    # matches how the `date` column is written.
    query = """SELECT
        name, city_id FROM cities
        WHERE city_id NOT IN
        (SELECT city_id FROM olx_data
        WHERE date=date('now'));
        """
    cursor = self.connection.cursor()
    try:
        cursor.execute(query)
        # fetchall() already returns a list, so no copy loop is needed.
        return cursor.fetchall()
    finally:
        cursor.close()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# def get_notready(self):
|
||||
# col = []
|
||||
# cursor = self.connection.cursor()
|
||||
# cursor.execute("""SELECT
|
||||
# name, city_id FROM cities
|
||||
# WHERE city_id NOT IN
|
||||
# (SELECT city_id FROM olx_data
|
||||
# WHERE date=date('now'));
|
||||
# """)
|
||||
# rows = cursor.fetchall()
|
||||
# for row in rows:
|
||||
# col.append(row)
|
||||
# return col
|
||||
|
||||
|
||||
# con = DbConnect("/home/krzychu/PycharmProjects/OLX_selenium/test_table.db")
|
||||
|
44
main.py
44
main.py
@ -4,49 +4,49 @@ from db_con import DbConnect
|
||||
from sel_source import SelRequest
|
||||
import logging
|
||||
import time
|
||||
import csv
|
||||
from datetime import date
|
||||
|
||||
|
||||
def removeaccents(input_text):
    """Return ``input_text`` with Polish diacritics replaced by ASCII letters.

    Args:
        input_text: Text that may contain the letters listed in
            ``accented`` below.

    Returns:
        str: A copy of the text in which each accented letter is swapped
        for the letter at the same position in ``plain``; every other
        character is left unchanged.
    """
    accented = 'ĄĆĘÓŁŚŻŹŃąćęółśżźń'
    plain = 'ACEOLSZZNaceolszzn'
    # Both strings have equal length: position i of `accented` maps to
    # position i of `plain`.
    return input_text.translate(str.maketrans(accented, plain))
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.FileHandler('logfile.log')
|
||||
logging.basicConfig(filename='logfile.log', filemode='w', level=logging.INFO,
|
||||
format='%(asctime)s :: %(levelname)s :: %(name)s :: %(message)s')
|
||||
|
||||
|
||||
con = DbConnect("olx_data.db")
|
||||
con = DbConnect("sync/olx_data.db")
|
||||
con.get_connection()
|
||||
|
||||
cities = con.get_data("""select city_id, name, is_dubble
|
||||
from custom where city_id not in
|
||||
(select city_id from olx_data
|
||||
where date = date('now'));""")
|
||||
cities = con.get_data("""select c.city_id, c.name, c.is_dubble
|
||||
from custom c left join
|
||||
(select city_id from olx_data where date = date('now')) o
|
||||
on o.city_id=c.city_id where o.city_id is null;""")
|
||||
|
||||
# double, cities_custom
|
||||
|
||||
url_base = 'https://www.olx.pl/d/nieruchomosci/mieszkania/'
|
||||
cities_list = [(url_base + removeaccents(i[1]).lower().replace(" ", "-") + i[2] + '/', i[0]) for i in cities]
|
||||
cities_list = [(url_base + con.remove_accents(i[1]).lower().replace(" ", "-") + i[2] + '/', i[0]) for i in cities]
|
||||
|
||||
drive = SelRequest()
|
||||
|
||||
for i in cities_list:
|
||||
start = time.time()
|
||||
curTime = date.today()
|
||||
row = SelRequest(i[0]).get_olx_stats()
|
||||
row.append(i[1])
|
||||
row.append(curTime)
|
||||
record = """
|
||||
print(i[0])
|
||||
source = drive.get_source(i[0])
|
||||
values = drive.get_olx_stats(source)
|
||||
values.append(i[1])
|
||||
values.append(curTime)
|
||||
end = time.time()
|
||||
if end-start < 2:
|
||||
print("Too quick, something is wrong with: ", i[0], '-- id:', i[1])
|
||||
pass
|
||||
query = """
|
||||
INSERT INTO
|
||||
olx_data(adv_rent_count, adv_sale_count, adv_exchange_count, city_id, date)
|
||||
VALUES
|
||||
(?, ?, ?, ?, ?); """
|
||||
con.run_query(record, row)
|
||||
end = time.time()
|
||||
logger.info(f'executed in {end-start}')
|
||||
con.insert_data(query, values)
|
||||
logger.info(f'loop executed in {end-start}')
|
||||
|
||||
|
||||
# with open('first.csv', 'rb') as inp, open('first_edit.csv', 'wb') as out:
|
||||
@ -83,4 +83,4 @@ for i in cities_list:
|
||||
# select c.city_id, c.name, c.is_dubble
|
||||
# from custom c left join
|
||||
# (select city_id from olx_data where date = date('now')) o
|
||||
# on o.city_id=c.city_id;
|
||||
# on o.city_id=c.city_id where o.city_id is null;
|
||||
|
103
sel_source.py
103
sel_source.py
@ -2,6 +2,7 @@ from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -12,8 +13,7 @@ logging.basicConfig(filename='logfile.log', filemode='w', level=logging.INFO,
|
||||
|
||||
class SelRequest:
|
||||
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
def __init__(self):
|
||||
self.options = Options()
|
||||
self.options.add_argument('--headless')
|
||||
self.options.add_argument('--no-sandbox')
|
||||
@ -21,35 +21,100 @@ class SelRequest:
|
||||
self.driver = webdriver.Chrome(options=self.options)
|
||||
self.source = ''
|
||||
|
||||
def get_source(self, url):
    """Load ``url`` in ``self.driver`` and return the resulting page source.

    Each driver step is timed separately and logged at DEBUG level, followed
    by the total time for the whole call.

    Args:
        url: Address of the page to load.

    Returns:
        The value of ``self.driver.page_source`` after the page was loaded.
    """
    logger.info(f'getting URL = {url}')
    total_start = time.time()

    start = time.time()
    self.driver.get(url)
    end = time.time()
    logger.debug(f'driver.get() took {end-start} s.')

    start = time.time()
    self.driver.execute_script("return document.body.innerHTML")
    end = time.time()
    logger.debug(f'driver.execute_script() took {end - start} s.')

    start = time.time()
    source = self.driver.page_source
    end = time.time()
    logger.debug(f'driver.page_source took {end - start} s.')

    # The driver is not closed here, so the same instance can serve the
    # next URL.
    # Measure from the very first step; the per-step timer was reset above.
    logger.debug(f'source obtained in {time.time() - total_start}')
    return source
|
||||
|
||||
def get_olx_stats(self, source):
    """Extract the rent / sale / exchange advert counts from page HTML.

    Args:
        source: HTML text of the page to parse.

    Returns:
        list: Three integers in the order "Wynajem", "Sprzedaż", "Zamiana".
        A category that is not found on the page is reported as 0.
    """
    soup = BeautifulSoup(source, "html.parser")
    links = soup.find_all('a', {'class': 'css-pyvavn'})
    for link in links:
        logger.debug(link.contents)

    # Each matching anchor is expected to look like:
    #   <a class="css-pyvavn" href="/d/nieruchomosci/mieszkania/wynajem/boleslawiec">
    #       Wynajem<span class="css-wz88">15</span></a>
    # so .contents is ['Wynajem', <span class="css-wz88">15</span>]:
    # contents[0] is the category label and contents[1].string is the count.
    lookfor = ["Wynajem", "Sprzedaż", "Zamiana"]
    stat = []
    for ad_type in lookfor:
        value = '0'
        for link in links:
            parts = link.contents
            # Skip anchors that lack the label + count pair instead of
            # failing with IndexError.
            if len(parts) > 1 and parts[0].string == ad_type:
                # .string is None when the span has nested markup.
                value = parts[1].string or '0'
        # Counts may contain a (non-breaking) space as thousands separator.
        stat.append(int(value.replace(u'\xa0', u'').replace(' ', '')))
    logger.info(f'found data: {stat}')
    return stat
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# lookfor = ["Wynajem", "Sprzedaż", "Zamiana"]
|
||||
# stat = []
|
||||
# for i in lookfor:
|
||||
# value = '0'
|
||||
# for el in span:
|
||||
# if str(el).find(i) > 0:
|
||||
# value = str(el)[str(el).find('>', str(el).find(i) + 20) + 1:str(el).find('<', str(el).find(i) + 20)]
|
||||
#
|
||||
# logger.info(f'found data: {stat}')
|
||||
# lookfor = ["Wynajem", "Sprzedaż", "Zamiana"]
|
||||
# stat = []
|
||||
# for i in lookfor:
|
||||
# value = '0'
|
||||
# for el in span:
|
||||
# if str(el[0]).find(i) > 0:
|
||||
# a = i.contents[1].string
|
||||
#
|
||||
# try:
|
||||
# stat.append(int(value))
|
||||
# except ValueError:
|
||||
# stat.append(int(value.replace(u'\xa0', u'')))
|
||||
# logger.info(f'found data: {stat}')
|
||||
# sys.exit()
|
||||
# return stat
|
||||
|
||||
|
||||
# link = 'https://www.olx.pl/d/nieruchomosci/mieszkania/lubin/'
|
||||
#
|
||||
# options = Options()
|
||||
|
Loading…
x
Reference in New Issue
Block a user