
Commit 2894042

Added Linkedin_Scrapper Script (HarshCasper#878)
* Added Linkedin_Scrapper Script
* Changes in Readme
* Small Change in Readme
* Changes in Readme
* Changes in Readme
* Small fix in main.py
* Small fix in main.py
* Small fix in main.py
1 parent 8071cad commit 2894042

File tree

3 files changed: +249 -0 lines changed
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Linkedin Scrapper

The script scrapes the data of a user's first-degree Linkedin connections.

It scrapes the following data for all connections (a sample record is sketched after this list):

- Name
- Linkedin_URL
- Latest_JOB_Position
- Skills
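
As a point of reference, one scraped record has the following shape (a hypothetical example; the values are placeholders, not real data):

```python
# Hypothetical record, matching the fields main.py collects
record = {
    "Name": "Jane Doe",
    "Linkedin_URL": "https://www.linkedin.com/in/...",
    "Latest_JOB_Position": "Software Engineer at Example Corp",
    "Skills": ["Python", "SQL"],
}
```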

## Requirements

- Linkedin Profile
- Python installed on the system
- Chromium
- Chrome
- Install Selenium (```pip3 install selenium```)
- Install bs4 (```pip3 install bs4```)
- Replace the username and password in credentials.json with your email address and password (a sketch follows this list)
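
A minimal sketch for filling in credentials.json (the two keys match the file shipped in this commit; the values are placeholders):

```python
# Write credentials.json next to main.py (placeholder values; substitute your own)
import json

creds = {"username": "you@example.com", "password": "your-password"}
with open("credentials.json", "w") as f:
    json.dump(creds, f)
```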

## How to Run

```python3 main.py```

The scraped data is written to output.csv; a snippet for reading it back is sketched below.
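
A short snippet for inspecting output.csv after a run (assumes the run completed and wrote at least one row):

```python
# Read back the CSV written by main.py
import csv

with open("output.csv", newline="") as f:
    for row in csv.DictReader(f):
        print(row["Name"], "-", row["Latest_JOB_Position"])
```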

## Author

Sukriti Sood
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
{
    "username": "Enter your Email Address",
    "password": "Enter your Password"
}
Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import csv

browser = webdriver.Chrome()

# Login page that redirects to the first-degree-connections people search after sign-in
pageurl = "https://www.linkedin.com/uas/login?session_redirect=https%3A%2F%2Fwww%2Elinkedin%2Ecom%2Fsearch%2Fresults%2Fpeople%2F%3Fnetwork%3D%255B%2522F%2522%255D%26origin%3DMEMBER_PROFILE_CANNED_SEARCH&fromSignIn=true&trk=cold_join_sign_in"

# Search-results URL for first-degree connections; the page number is appended while paginating
paginationurl = "https://www.linkedin.com/search/results/people/?network=%5B%22F%22%5D&origin=MEMBER_PROFILE_CANNED_SEARCH&page="

# Function to read username and password from file
def read_creds(filename):
    """This function reads the username and password from credentials.json.

    Arguments:
        filename: name of the file which stores the credentials

    :return: returns the credentials as a dict
    """
    with open(filename) as f:
        credentials = json.load(f)
    return credentials


# Function to find skills
def find_skills(url):
    """This function finds the skills listed on the Linkedin profile of a connection.

    Arguments:
        url: url of the profile

    :return: returns the list containing the skills
    """
    browser.get(url)
    time.sleep(3)
    skill_set = []

    # Scroll down in 400px steps until the scroll position stops changing,
    # so the lazily loaded skills section gets rendered
    last_height = browser.execute_script("return window.scrollY")
    while True:
        browser.execute_script("window.scrollTo(0, window.scrollY + 400);")
        time.sleep(2)
        new_height = browser.execute_script("return window.scrollY")
        if new_height == last_height:
            break
        last_height = new_height

    time.sleep(2)

    Skills = browser.find_elements(By.CLASS_NAME, "pv-skill-category-entity")
    if len(Skills) == 0:
        skill_set.append("No skills are mentioned in the Linkedin profile")
    else:
        # Expand the "Show more" control, if present, so every skill is in the DOM
        show_more_button = browser.find_elements(By.CLASS_NAME, "pv-skills-section__additional-skills")
        if len(show_more_button) > 0:
            browser.execute_script("arguments[0].click();", show_more_button[0])

        pagesource = browser.page_source
        content = BeautifulSoup(pagesource, "html.parser")
        Skills = content.find_all("li", class_="pv-skill-category-entity")
        for skill in Skills:
            skill_text = skill.find("p", class_="pv-skill-category-entity__name").text.strip()
            skill_set.append(skill_text.split("\n")[0])
    return skill_set
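
# A usage sketch (hypothetical profile URL); with a logged-in browser session,
# find_skills can be exercised on its own:
#   skills = find_skills("https://www.linkedin.com/in/some-profile/")
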
# Function to log in
def login_and_npage():
    """This function logs in to Linkedin and returns the number of pages of connections.

    :return: number of pages
    """
    creds = read_creds('credentials.json')

    # Open the login page and submit the credentials
    browser.get(pageurl)
    elementID = browser.find_element(By.ID, "username")
    elementID.send_keys(creds['username'])
    elementID = browser.find_element(By.ID, "password")
    elementID.send_keys(creds['password'])
    elementID.submit()
    url = browser.current_url
    browser.get(url)

    numberofpages = 0

    time.sleep(2)
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to the bottom so the pagination indicators load
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        elements = browser.find_elements(By.CLASS_NAME, "artdeco-pagination__indicator--number")
        if len(elements) > 0:
            # The last numbered indicator holds the total page count
            numberofpages = int(elements[-1].text)
            return numberofpages

        # Wait for the page to load
        time.sleep(2)

        # Calculate the new scroll height and compare it with the last scroll height
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # No pagination indicator was found; the caller then scrapes zero pages
    return 0


# Function to find connection data containing name and profile url only
def connectiondatahelper(numberofpages):
    """This function returns the data of connections containing the name and profile url.

    Arguments:
        numberofpages: number of pages of connections

    :return: connectiondata
    """
    connectiondata = []
    for j in range(1, numberofpages + 1):
        # Load one page of search results and parse it with BeautifulSoup
        browser.get(paginationurl + str(j))
        time.sleep(3)
        pagesource = browser.page_source
        soup = BeautifulSoup(pagesource, 'html.parser')
        connections = soup.find_all("div", class_="entity-result__content")

        for connection in connections:
            titlecontainer = connection.find(class_="entity-result__title-text")
            nameelement = titlecontainer.find("a", class_="app-aware-link")
            # The visible name sits in a span; drop the trailing "View ... profile" text
            nametext = nameelement.find("span", {"dir": "ltr"}).text.split("View")[0]
            url = nameelement["href"]
            connectionobj = {"name": nametext, "profileurl": url}
            connectiondata.append(connectionobj)

    return connectiondata
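
# Shape of each entry appended above (a sketch; values are placeholders):
#   {"name": "<display name>", "profileurl": "https://www.linkedin.com/in/..."}
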
# Function for finding the required data of each connection
def finalconnectiondata(connectiondata):
    """This function returns the list containing the data of all connections.

    Arguments:
        connectiondata: list containing the name and profileurl of the connections

    :return: finalconnectiondata
    """
    final_connectiondata = []

    for connection in connectiondata:
        skill_set = find_skills(connection["profileurl"])
        browser.get(connection["profileurl"])
        time.sleep(3)

        pagesource = browser.page_source
        soup = BeautifulSoup(pagesource, 'html.parser')

        # The first experience entry links to the company and holds the latest job title
        experiencecontainer = soup.find("a", {"data-control-name": "background_details_company"})
        if experiencecontainer is None:
            l_jobtitle = "No job experience is listed on the Linkedin profile"
        else:
            l_jobtitle = experiencecontainer.find("h3").text.strip()

        time.sleep(2)

        linkedin_connection = {"Name": connection["name"], "Linkedin_URL": connection["profileurl"], "Latest_JOB_Position": l_jobtitle, "Skills": skill_set}
        final_connectiondata.append(linkedin_connection)

    return final_connectiondata


def main():
    """Entry point of execution."""
    browser.maximize_window()
    numberofpages = login_and_npage()
    smalldata_of_connections = connectiondatahelper(numberofpages)
    final_connectiondata = finalconnectiondata(smalldata_of_connections)
    browser.quit()

    # Guard against an empty result set before indexing into it
    if not final_connectiondata:
        print("No connection data was scraped.")
        return

    # Write one CSV row per connection, with the dict keys as the header
    keys = final_connectiondata[0].keys()
    with open('output.csv', 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(final_connectiondata)


if __name__ == "__main__":
    main()
