Skip to content

Commit 6997575

Browse files
authored
Merge branch 'master' into bug/work-times-undef
2 parents 0620d04 + 00c0500 commit 6997575

File tree

8 files changed

+105
-47
lines changed

8 files changed

+105
-47
lines changed

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,21 @@ First, you must set your chromedriver location by
5959
export CHROMEDRIVER=~/chromedriver
6060
```
6161

62+
## Sponsor
63+
[![rds-cost](https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png)](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)
64+
65+
Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).
66+
67+
• Scraping public profiles is battle-tested in court in the HiQ v. LinkedIn case.<br/>
68+
• GDPR, CCPA, SOC2 compliant<br/>
69+
• High rate limit - 300 requests/minute<br/>
70+
• Fast - APIs respond in ~2s<br/>
71+
• Fresh data - 88% of data is scraped in real-time; the other 12% is no older than 29 days<br/>
72+
• High accuracy<br/>
73+
• Tons of data points returned per profile
74+
75+
Built for developers, by developers.
76+
6277
## Usage
6378
To use it, just create the class.
6479

README.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,25 @@ First, you must set your chromedriver location by
7676
7777
export CHROMEDRIVER=~/chromedriver
7878
79+
Sponsor
80+
-------
81+
82+
.. image:: https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png
83+
:alt: Proxycurl API
84+
:target: https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism
85+
86+
Scrape public LinkedIn profile data at scale with `Proxycurl APIs <https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism>`_.
87+
88+
• Scraping public profiles is battle-tested in court in the HiQ v. LinkedIn case.
89+
• GDPR, CCPA, SOC2 compliant
90+
• High rate limit - 300 requests/minute
91+
• Fast - APIs respond in ~2s
92+
• Fresh data - 88% of data is scraped in real-time; the other 12% is no older than 29 days
93+
• High accuracy
94+
• Tons of data points returned per profile
95+
96+
Built for developers, by developers.
97+
7998
Usage
8099
-----
81100

docs/proxycurl.png

5.9 KB
Loading

linkedin_scraper/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .jobs import Job
66
from .job_search import JobSearch
77

8-
__version__ = "2.11.0"
8+
__version__ = "2.11.1"
99

1010
import glob
1111
modules = glob.glob(dirname(__file__)+"/*.py")

linkedin_scraper/job_search.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ def scrape_job_card(self, base_element) -> Job:
3636
job_div = self.wait_for_element_to_load(name="job-card-list__title", base=base_element)
3737
job_title = job_div.text.strip()
3838
linkedin_url = job_div.get_attribute("href")
39-
company = base_element.find_element_by_class_name("job-card-container__primary-description")
40-
location = base_element.find_element_by_class_name("job-card-container__metadata-item")
39+
company = base_element.find_element_by_class_name("job-card-container__primary-description").text
40+
location = base_element.find_element_by_class_name("job-card-container__metadata-item").text
4141
job = Job(linkedin_url=linkedin_url, job_title=job_title, company=company, location=location, scrape=False, driver=self.driver)
4242
return job
4343

linkedin_scraper/jobs.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,27 @@ def __init__(
4040
self.scrape(close_on_complete)
4141

4242
def __repr__(self):
43-
return f"{self.job_title} {self.company}"
43+
return f"<Job {self.job_title} {self.company}>"
4444

4545
def scrape(self, close_on_complete=True):
4646
if self.is_signed_in():
4747
self.scrape_logged_in(close_on_complete=close_on_complete)
4848
else:
4949
raise NotImplemented("This part is not implemented yet")
5050

51+
def to_dict(self):
52+
return {
53+
"linkedin_url": self.linkedin_url,
54+
"job_title": self.job_title,
55+
"company": self.company,
56+
"company_linkedin_url": self.company_linkedin_url,
57+
"location": self.location,
58+
"posted_date": self.posted_date,
59+
"applicant_count": self.applicant_count,
60+
"job_description": self.job_description,
61+
"benefits": self.benefits
62+
}
63+
5164

5265
def scrape_logged_in(self, close_on_complete=True):
5366
driver = self.driver
@@ -63,8 +76,15 @@ def scrape_logged_in(self, close_on_complete=True):
6376
self.applicant_count = self.wait_for_element_to_load(name="jobs-unified-top-card__applicant-count").text.strip()
6477
except TimeoutException:
6578
self.applicant_count = 0
66-
self.job_description = self.wait_for_element_to_load(name="jobs-description").text.strip()
67-
self.benefits = self.wait_for_element_to_load(name="jobs-unified-description__salary-main-rail-card").text.strip()
79+
job_description_elem = self.wait_for_element_to_load(name="jobs-description")
80+
self.mouse_click(job_description_elem.find_element_by_tag_name("button"))
81+
job_description_elem = self.wait_for_element_to_load(name="jobs-description")
82+
job_description_elem.find_element_by_tag_name("button").click()
83+
self.job_description = job_description_elem.text.strip()
84+
try:
85+
self.benefits = self.wait_for_element_to_load(name="jobs-unified-description__salary-main-rail-card").text.strip()
86+
except TimeoutException:
87+
self.benefits = None
6888

6989
if close_on_complete:
7090
driver.close()

linkedin_scraper/objects.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from . import constants as c
77

8+
from selenium import webdriver
89
from selenium.webdriver.common.by import By
910
from selenium.webdriver.support.wait import WebDriverWait
1011
from selenium.webdriver.support import expected_conditions as EC
@@ -72,6 +73,10 @@ def focus(self):
7273
self.driver.execute_script('alert("Focus window")')
7374
self.driver.switch_to.alert.accept()
7475

76+
def mouse_click(self, elem):
77+
action = webdriver.ActionChains(self.driver)
78+
action.move_to_element(elem).perform()
79+
7580
def wait_for_element_to_load(self, by=By.CLASS_NAME, name="pv-top-card", base=None):
7681
base = base or self.driver
7782
return WebDriverWait(base, self.WAIT_FOR_ELEMENT_TIMEOUT).until(

linkedin_scraper/person.py

Lines changed: 40 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -102,67 +102,66 @@ def _click_see_more_by_class_name(self, class_name):
102102

103103
def is_open_to_work(self):
104104
try:
105-
return "#OPEN_TO_WORK" in self.driver.find_element_by_class_name("pv-top-card-profile-picture").find_element_by_tag_name("img").get_attribute("title")
105+
return "#OPEN_TO_WORK" in self.driver.find_element(By.CLASS_NAME,"pv-top-card-profile-picture").find_element(By.TAG_NAME,"img").get_attribute("title")
106106
except:
107107
return False
108108

109109
def get_experiences(self):
110110
url = os.path.join(self.linkedin_url, "details/experience")
111111
self.driver.get(url)
112112
self.focus()
113-
main = self.wait_for_element_to_load(by=By.ID, name="main")
113+
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
114114
self.scroll_to_half()
115115
self.scroll_to_bottom()
116116
main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
117-
for position in main_list.find_elements_by_xpath("li"):
118-
position = position.find_element_by_class_name("pvs-entity")
119-
company_logo_elem, position_details = position.find_elements_by_xpath("*")
117+
for position in main_list.find_elements(By.XPATH,"li"):
118+
position = position.find_element(By.CLASS_NAME,"pvs-entity")
119+
company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
120120

121121
# company elem
122-
company_linkedin_url = company_logo_elem.find_element_by_xpath("*").get_attribute("href")
122+
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
123123

124124
# position details
125-
position_details_list = position_details.find_elements_by_xpath("*")
125+
position_details_list = position_details.find_elements(By.XPATH,"*")
126126
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
127127
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
128-
outer_positions = position_summary_details.find_element_by_xpath("*").find_elements_by_xpath("*")
128+
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
129129

130130
if len(outer_positions) == 4:
131-
position_title = outer_positions[0].find_element_by_tag_name("span").find_element_by_tag_name("span").text
132-
company = outer_positions[1].find_element_by_tag_name("span").text
133-
work_times = outer_positions[2].find_element_by_tag_name("span").text
134-
location = outer_positions[3].find_element_by_tag_name("span").text
131+
position_title = outer_positions[0].find_element(By.TAG_NAME,"span").find_element(By.TAG_NAME,"span").text
132+
company = outer_positions[1].find_element(By.TAG_NAME,"span").text
133+
work_times = outer_positions[2].find_element(By.TAG_NAME,"span").text
134+
location = outer_positions[3].find_element(By.TAG_NAME,"span").text
135135
elif len(outer_positions) == 3:
136136
if "·" in outer_positions[2].text:
137-
position_title = outer_positions[0].find_element_by_tag_name("span").find_element_by_tag_name("span").text
138-
company = outer_positions[1].find_element_by_tag_name("span").text
139-
work_times = outer_positions[2].find_element_by_tag_name("span").text
137+
position_title = outer_positions[0].find_element(By.TAG_NAME,"span").find_element(By.TAG_NAME,"span").text
138+
company = outer_positions[1].find_element(By.TAG_NAME,"span").text
139+
work_times = outer_positions[2].find_element(By.TAG_NAME,"span").text
140140
location = ""
141141
else:
142142
position_title = ""
143-
company = outer_positions[0].find_element_by_tag_name("span").find_element_by_tag_name("span").text
143+
company = outer_positions[0].find_element(By.TAG_NAME,"span").find_element(By.TAG_NAME,"span").text
144144
company = company[:company.find("\n")]
145-
work_times = outer_positions[1].find_element_by_tag_name("span").text
146-
location = outer_positions[2].find_element_by_tag_name("span").text
145+
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
146+
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
147147
else:
148148
# len(outer_positions) == 2
149-
company = outer_positions[0].find_element_by_tag_name("span").text
150-
work_times = outer_positions[1].find_element_by_tag_name("span").text
149+
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
150+
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
151151
position_title = ""
152152
location = ""
153153

154-
if position_summary_text and len(
155-
position_summary_text.find_element_by_class_name("pvs-list").find_element_by_class_name("pvs-list").find_elements_by_xpath("li")) > 1:
156-
descriptions = position_summary_text.find_element_by_class_name("pvs-list").find_element_by_class_name("pvs-list").find_elements_by_xpath("li")
154+
if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
155+
descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
157156
for description in descriptions:
158-
res = description.find_element_by_tag_name("a").find_elements_by_xpath("*")
157+
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
159158
position_title_elem = res[0] if len(res) > 0 else None
160159
work_times_elem = res[1] if len(res) > 1 else None
161160
location_elem = res[2] if len(res) > 2 else None
162161

163-
location = location_elem.find_element_by_xpath("*").text if location_elem else None
164-
position_title = position_title_elem.find_element_by_xpath("*").find_element_by_tag_name("*").text if position_title_elem else ""
165-
work_times = work_times_elem.find_element_by_xpath("*").text if work_times_elem else ""
162+
location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
163+
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
164+
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""
166165
times = work_times.split("·")[0].strip() if work_times else ""
167166
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
168167
from_date = " ".join(times.split(" ")[:2]) if times else ""
@@ -204,27 +203,27 @@ def get_educations(self):
204203
url = os.path.join(self.linkedin_url, "details/education")
205204
self.driver.get(url)
206205
self.focus()
207-
main = self.wait_for_element_to_load(by=By.ID, name="main")
206+
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
208207
self.scroll_to_half()
209208
self.scroll_to_bottom()
210209
main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
211-
for position in main_list.find_elements_by_class_name("pvs-entity"):
212-
institution_logo_elem, position_details = position.find_elements_by_xpath("*")
210+
for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
211+
institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")
213212

214213
# company elem
215-
institution_linkedin_url = institution_logo_elem.find_element_by_xpath("*").get_attribute("href")
214+
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
216215

217216
# position details
218-
position_details_list = position_details.find_elements_by_xpath("*")
217+
position_details_list = position_details.find_elements(By.XPATH,"*")
219218
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
220219
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
221-
outer_positions = position_summary_details.find_element_by_xpath("*").find_elements_by_xpath("*")
220+
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
222221

223-
institution_name = outer_positions[0].find_element_by_tag_name("span").find_element_by_tag_name("span").text
224-
degree = outer_positions[1].find_element_by_tag_name("span").text
222+
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").find_element(By.TAG_NAME,"span").text
223+
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
225224

226225
if len(outer_positions) > 2:
227-
times = outer_positions[2].find_element_by_tag_name("span").text
226+
times = outer_positions[2].find_element(By.TAG_NAME,"span").text
228227

229228
from_date = " ".join(times.split(" ")[:2])
230229
to_date = " ".join(times.split(" ")[3:])
@@ -247,14 +246,14 @@ def get_educations(self):
247246
self.add_education(education)
248247

249248
def get_name_and_location(self):
250-
top_panels = self.driver.find_elements_by_class_name("pv-text-details__left-panel")
251-
self.name = top_panels[0].find_elements_by_xpath("*")[0].text
252-
self.location = top_panels[1].find_element_by_tag_name("span").text
249+
top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
250+
self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
251+
self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
253252

254253

255254
def get_about(self):
256255
try:
257-
about = self.driver.find_element_by_id("about").find_element_by_xpath("..").find_element_by_class_name("display-flex").text
256+
about = self.driver.find_element(By.ID,"about").find_element(By.XPATH,"..").find_element(By.CLASS_NAME,"display-flex").text
258257
except NoSuchElementException :
259258
about=None
260259
self.about = about
@@ -390,7 +389,7 @@ def job_title(self):
390389
return None
391390

392391
def __repr__(self):
393-
return "{name}\n\nAbout\n{about}\n\nExperience\n{exp}\n\nEducation\n{edu}\n\nInterest\n{int}\n\nAccomplishments\n{acc}\n\nContacts\n{conn}".format(
392+
return "<Person {name}\n\nAbout\n{about}\n\nExperience\n{exp}\n\nEducation\n{edu}\n\nInterest\n{int}\n\nAccomplishments\n{acc}\n\nContacts\n{conn}>".format(
394393
name=self.name,
395394
about=self.about,
396395
exp=self.experiences,

0 commit comments

Comments
 (0)