Skip to content

Commit d5cc809

Browse files
NikaNika
authored and committed
Scrape myhome.ge for conditions that suit my needs
0 parents  commit d5cc809

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

scrape-development.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
from selenium import webdriver
2+
from selenium.webdriver.chrome.options import Options
3+
from selenium.webdriver.chrome.service import Service
4+
from selenium.webdriver.common.by import By
5+
from webdriver_manager.chrome import ChromeDriverManager
6+
from playsound import playsound
7+
import time
8+
9+
# Record the start time so the total run time can be reported at the end.
start_time = time.time()

print('Traversing throughout the internet. Keep patience...')

options = Options()

# Run Chrome without popping up its GUI window. The flag is passed directly
# because the `Options.headless` property was deprecated and later removed in
# Selenium 4; on those releases assigning to it is silently ignored.
options.add_argument("--headless")
# Don't close the browser when the script finishes.
# NOTE(review): together with headless mode this leaves an invisible Chrome
# process running after the script exits -- confirm this is intended.
options.add_experimental_option("detach", True)
options.add_argument("--window-size=1920,1200")

driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))

driver.get('https://www.myhome.ge/ka/s/iyideba-bina-Tbilisi?Keyword=%E1%83%97%E1%83%91%E1%83%98%E1%83%9A%E1%83%98%E1%83%A1%E1%83%98&AdTypeID=1&PrTypeID=1&mapC=41.73188365%2C44.8368762993663&regions=687611312.687586034.688137211.687602533.687618311.-1&districts=26445359.2022621279.58873656.1651252654.737816010.2172993612.28045012.737261604.152296216&cities=1996871&GID=1996871')

# The text of the last pagination link is used as the number of result pages.
# It is kept as a string (converted with int() where it is consumed). When the
# link is not found, fall back to a single page instead of raising IndexError.
_last_page_links = driver.find_elements(By.XPATH, '//li[@class="page-item number space-item-last"]/a[@class="page-link"]')
last_page = _last_page_links[0].text if _last_page_links else '1'
forward_button = ''

# Substrings searched for in a listing description: Latin transliterations
# followed by the Georgian spellings.
search_keywords = [
    'saswrafod',
    'saswrafot',
    'sastsrafod',
    'sastsrafot',
    'mechqareba',
    'mechkareba',
    'sachqarod',
    'sachkarod',
    'sachqarot',
    'sachkarot',
    'სასწრაფოთ',
    'სასწრაფოდ',
    'მეჩქარება',
    'საჩქაროთ',
    'საჩქაროდ',
]
44+
45+
# workaround for .click() method. Since that method didn't work. My guess is the website tries to block crawlers
46+
# so this solution is also viable
47+
def click(element):
    """Click *element* by running JavaScript in the page.

    Used in place of ``WebElement.click()``: the script passes the element
    to the browser as ``arguments[0]`` and invokes its DOM ``click()``.

    Args:
        element: the Selenium element to click; it must belong to the
            module-level ``driver`` session.
    """
    driver.execute_script("arguments[0].click();", element)
49+
50+
# you can use any condition here
51+
def condition_check(price, sqm, floor=4, *, max_price=1700, min_sqm=90, min_floor=3):
    """Return True when a listing satisfies every numeric requirement.

    All three comparisons are strict, so a value equal to its limit fails.

    Args:
        price: listing price; must be below ``max_price``.
        sqm: listing area; must be above ``min_sqm``.
        floor: floor number; must be above ``min_floor``.
        max_price: exclusive upper bound for ``price``.
        min_sqm: exclusive lower bound for ``sqm``.
        min_floor: exclusive lower bound for ``floor``.

    Returns:
        bool: True only if all three conditions hold.
    """
    # Logical `and` (short-circuiting) instead of bitwise `&` on booleans.
    return price < max_price and sqm > min_sqm and floor > min_floor
56+
57+
# you can use any condition here
58+
def keyword_check(text, keywords=None):
    """Return True when *text* contains at least one of the keywords.

    Matching is a case-insensitive substring test, so "SASWRAFOD" and
    "Saswrafod" are found by the keyword "saswrafod".

    Args:
        text: the text to search.
        keywords: iterable of substrings to look for; defaults to the
            module-level ``search_keywords`` list.

    Returns:
        bool: True if any keyword occurs in ``text``, otherwise False.
    """
    if keywords is None:
        keywords = search_keywords
    haystack = text.casefold()
    return any(key.casefold() in haystack for key in keywords)
63+
64+
# here I check all cards on provided website and if all my conditions are met
65+
# each card I open in new tab and check on my condition functions above
66+
# in case of success I write the tab link in a file and close the tab
67+
# playsound is library for music player
68+
# if you use playsound part here make sure to install 'PyObjC' package as well
69+
# XPath locators: listing cards on a results page, and fields on a listing page.
_CARD_XPATH = '//div[@class="statement-card"]/a[@class="card-container"]'
_PRICE_XPATH = '//div[@class="price_icons_wrap d-flex align-items-end justify-content-between"]/span[@class="d-block price-per-square"]/span[@class="convertable"]'
_SQM_XPATH = '//div[@class="main-features row no-gutters"]/div[@class="col-6 col-lg-4 mb-0 mb-md-4 mb-lg-0 d-flex align-items-center mb-lg-0 mb-4 pr-2 pr-lg-0"]//span[@class="d-block"]'
_DESC_XPATH = '//p[@class="pr-comment translated"]'


def _first_text(xpath):
    """Return the text of the first element matching *xpath*, or '' if none."""
    found = driver.find_elements(By.XPATH, xpath)
    return found[0].text if found else ''


def _parse_int(text, default):
    """Return ``int(text)``, or *default* when the text is not a plain integer."""
    try:
        return int(text.strip())
    except ValueError:
        return default


def _open_in_new_tab(card, timeout=10):
    """Click *card*, wait for a second tab, and switch to it.

    Returns False when no second tab appears within *timeout* seconds.
    """
    click(card)
    deadline = time.monotonic() + timeout
    # The tab is not guaranteed to exist the instant the click returns.
    while len(driver.window_handles) < 2:
        if time.monotonic() > deadline:
            return False
        time.sleep(0.2)
    driver.switch_to.window(driver.window_handles[1])
    return True


def check_perform(apply_conditions=False):
    """Open every listing card on the current results page and record matches.

    Each card is opened in a second tab. A listing matches when its
    description passes ``keyword_check``. The matching tab's URL is appended
    to ``appartment-info.txt`` and a notification sound is played. The tab is
    then closed and focus returns to the results tab.

    Args:
        apply_conditions: when True, a listing must also pass
            ``condition_check(price, sqm)``, using the price and area parsed
            from the listing page. Defaults to False, which keeps the
            keyword-only behaviour.
    """
    results_tab = driver.current_window_handle
    cards = driver.find_elements(By.XPATH, _CARD_XPATH)

    for card in cards:
        if not _open_in_new_tab(card):
            # No listing tab appeared; nothing to inspect or close.
            continue

        try:
            matched = keyword_check(_first_text(_DESC_XPATH))

            if matched and apply_conditions:
                # Missing or non-numeric fields fall back to values that
                # cannot satisfy condition_check's default thresholds.
                price = _parse_int(_first_text(_PRICE_XPATH), 99999999)
                # The area text carries a 6-character suffix after the number.
                sqm = _parse_int(_first_text(_SQM_XPATH)[:-6], 0)
                matched = condition_check(price, sqm)

            if matched:
                # NOTE: relative path -- the file is created in the current
                # working directory.
                with open('appartment-info.txt', 'a', encoding='utf-8') as f:
                    f.write('\n' + driver.current_url)
                playsound('/Users/nika/Documents/Music/Sounds/notification_sound.mp3')
        finally:
            # Always close the listing tab, even on error, so the next card
            # still opens as the second tab. Closing a tab does not move the
            # driver's focus, so switch back to the results tab explicitly.
            driver.close()
            driver.switch_to.window(results_tab)
105+
106+
# The results are paginated: scrape the current page, then move to the next
# one and wait 20s so that ajax has time to load the new content.
_FORWARD_XPATH = '//li[@class="page-item step-forward-item"]/a[@class="page-link step-forward active-step"]'
_total_pages = int(last_page)

for page in range(_total_pages):
    check_perform()

    if page == _total_pages - 1:
        # Last page scraped; no need to navigate or wait any further.
        break

    # Look the button up after scraping, and stop cleanly when it is missing
    # instead of failing with IndexError.
    _forward_links = driver.find_elements(By.XPATH, _FORWARD_XPATH)
    if not _forward_links:
        break
    forward_button = _forward_links[0]
    click(forward_button)
    time.sleep(20)


print('Process ended. Check the output file!')

# Report how long the whole run took.
print("--- %s seconds ---" % (time.time() - start_time))

0 commit comments

Comments
 (0)