1
1
#! /usr/bin/env python3
2
- #Instagram Scraper
2
+ # Instagram Scraper
3
+ import argparse
4
+ from banner import banner
3
5
from bs4 import BeautifulSoup
4
- import requests , random , sys , json , time , os , argparse
6
+ import json
7
+ import os
8
+ import requests
9
+ import random
10
+ import string
11
+ import sys
12
+ import time
5
13
6
14
7
15
class colors :
@@ -17,149 +25,180 @@ class colors:
17
25
18
26
class Scraper :
19
27
20
-
21
- def __init__ ( self ):
22
- self . user_agents = [ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
23
- 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
24
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
25
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14' ,
26
- 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' ,
27
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36' ,
28
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36' ,
29
- 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' ,
30
- 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
31
- 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' ]
28
+ def __init__ ( self , username ):
29
+ self . user_agents = [
30
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
31
+ 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
32
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
33
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14' ,
34
+ 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' ,
35
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36' ,
36
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36' ,
37
+ 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36' ,
38
+ 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' ,
39
+ 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' ]
32
40
self .profile_data = {}
41
+ self .username = username
42
+ self .make_directory ()
43
+ self .scrape (self .username )
44
+ self .print_data ()
45
+ self .save_data ()
33
46
34
47
35
- def convert_to_int (self , num : str ):
36
- '''Converts values like 11.9k to 11900 because instagram shortens
37
- their follower count, this currently does not work and idk how to fix it'''
38
- if "k" in num :
39
- #Find the first few digits that should be * 1000
40
- try :
41
- front = int (num [:num .index ('.' )])
42
- back = int (num [num .index ('.' )+ 1 ])
43
- except ValueError :
44
- return (front * 1000 )
45
-
46
- return (front * 1000 ) + (back * 100 )
47
-
48
- elif "m" in num :
49
- try :
50
- front = int (num [:num .index ('.' )])
51
- back = int (num [num .index ('.' )+ 1 ])
52
- except ValueError :
53
- return (front * 1000000 )
54
-
55
- return (front * 1000000 ) + (back * 100000 )
56
-
57
- else :
58
- return int (num .replace (',' , '' ))
59
-
60
48
61
- def scrape (self ,username :str ):
62
- '''Takes a username as a string to find information about that person's instagram profile, a random
63
- user agent is picked to spoof when the data is collected'''
64
- time .sleep (2 )
65
- #Get the html data with the requests module
49
+ def scrape (self , username : str ):
50
+ """Takes a username as a string to find information about that person's instagram profile, a random
51
+ user agent is picked to spoof when the data is collected
52
+ :return: none
53
+ :param: username: str
54
+ """
55
+ print ("[*] Starting Scan" )
56
+ # Get the html data with the requests module
66
57
r = requests .get (f'http://instagram.com/{ username } ' , headers = {'User-Agent' : random .choice (self .user_agents )})
67
- soup = BeautifulSoup (r .text ,'html.parser' )
68
- #Find the tags that hold the data we want to parse
69
- general_data = soup .find_all ('meta' ,attrs = {'property' :'og:description' })
70
- more_data = soup .find_all ('script' ,attrs = {'type' :'text/javascript' })
58
+ soup = BeautifulSoup (r .text , 'html.parser' )
59
+ # Find the tags that hold the data we want to parse
60
+ general_data = soup .find_all ('meta' , attrs = {'property' : 'og:description' })
61
+ more_data = soup .find_all ('script' , attrs = {'type' : 'text/javascript' })
71
62
description = soup .find ('script' , attrs = {'type' : 'application/ld+json' })
72
- #Try to parse the nessicary content but if it fails, then user != exist
63
+ # Try to parse the nessicary content but if it fails, then user != exist
73
64
try :
74
65
text = general_data [0 ].get ('content' ).split ()
75
- #Get the json data held inside of the <script> type="applicaiton/il/json"
66
+ # Get the json data held inside of the <script> type="applicaiton/il/json"
76
67
description = json .loads (description .get_text ())
77
68
profile_meta = json .loads (more_data [3 ].get_text ()[21 :].strip (';' ))
78
69
79
70
except :
80
71
print (colors .FAIL + f"Username { username } not found" + colors .ENDC )
81
72
sys .exit ()
82
73
83
- #If the user does not have anything in their bio, the value will not be in the json dump
84
- #So we just set the bio to an empty string
74
+ # If the user does not have anything in their bio, the value will not be in the json dump
75
+ # So we just set the bio to an empty string
76
+ # I don't know if I still need this try/catch block atm
85
77
try :
86
- self .profile_data = {"Username" : text [- 1 ], "Profile name" : description ['name' ], "URL" : description ['mainEntityofPage' ]['@id' ],
87
- "Followers" : text [0 ], "Following" : text [2 ], "Posts" : text [4 ], "Bio" : description ['description' ],
88
- "profile_pic_url" : profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['profile_pic_url_hd' ],
89
- "is_business_account" : profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_business_account' ],
90
- "connected_to_fb" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['connected_fb_page' ]),
91
- "externalurl" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['external_url' ]),
92
- "joined_recently" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_joined_recently' ]),
93
- "business_category_name" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['business_category_name' ]),
94
- "is_private" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_private' ]),
95
- "is_verified" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_verified' ])}
78
+ self .profile_data = {"Username" : text [- 1 ], "Profile name" : description ['name' ],
79
+ "URL" : description ['mainEntityofPage' ]['@id' ],
80
+ "Followers" : text [0 ], "Following" : text [2 ], "Posts" : text [4 ],
81
+ "Bio" : str (
82
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['biography' ]),
83
+ "profile_pic_url" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
84
+ 'profile_pic_url_hd' ]),
85
+ "is_business_account" : str (
86
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
87
+ 'is_business_account' ]),
88
+ "connected_to_fb" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
89
+ 'connected_fb_page' ]),
90
+ "externalurl" : str (
91
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['external_url' ]),
92
+ "joined_recently" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
93
+ 'is_joined_recently' ]),
94
+ "business_category_name" : str (
95
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
96
+ 'business_category_name' ]),
97
+ "is_private" : str (
98
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_private' ]),
99
+ "is_verified" : str (
100
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_verified' ])}
96
101
except KeyError :
97
- profile_data = {"Username" : text [- 1 ], "Profile name" : description ['name' ],
98
- "Followers" : text [0 ], "Following" : text [2 ], "Posts" : text [4 ], "Bio" : '' , "URL" :description ['mainEntityofPage' ]['@id' ], "ProfilePictureURL" :description ['image' ]}
102
+ self .profile_data = {"Username" : text [- 1 ], "Profile name" : description ['name' ],
103
+ "URL" : description ['mainEntityofPage' ]['@id' ],
104
+ "Followers" : text [0 ], "Following" : text [2 ], "Posts" : text [4 ],
105
+ "Bio" : '' ,
106
+ "profile_pic_url" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
107
+ 'profile_pic_url_hd' ]),
108
+ "is_business_account" : str (
109
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
110
+ 'is_business_account' ]),
111
+ "connected_to_fb" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
112
+ 'connected_fb_page' ]),
113
+ "externalurl" : str (
114
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['external_url' ]),
115
+ "joined_recently" : str (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
116
+ 'is_joined_recently' ]),
117
+ "business_category_name" : str (
118
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ][
119
+ 'business_category_name' ]),
120
+ "is_private" : str (
121
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_private' ]),
122
+ "is_verified" : str (
123
+ profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['is_verified' ])}
124
+ # Tries to scrape posts if it is a public profile
125
+ self .scrape_posts (profile_meta )
126
+
127
+
128
+ def scrape_posts (self , profile_meta : str ):
129
+ """Scrapes all posts and downloads thumbnails when necessary
130
+ :return: none
131
+ """
132
+ if self .profile_data ['is_private' ].lower () == 'true' :
133
+ print ("Private profile, cannot scrape photos!" )
134
+ else :
135
+ posts = {}
136
+ for index , post in enumerate (profile_meta ['entry_data' ]['ProfilePage' ][0 ]['graphql' ]['user' ]['edge_owner_to_timeline_media' ]['edges' ]):
137
+ posts [index ] = {"Caption" : str (post ['node' ]['edge_media_to_caption' ]['edges' ][0 ]['node' ]['text' ]),
138
+ "Number of Comments" : str (post ['node' ]['edge_media_to_comment' ]['count' ]),
139
+ "Comments Disabled" : str (post ['node' ]['comments_disabled' ]),
140
+ "Taken At Timestamp" : str (post ['node' ]['taken_at_timestamp' ]),
141
+ "Number of Likes" : str (post ['node' ]['edge_liked_by' ]['count' ]),
142
+ "Location" : str (post ['node' ]['location' ]),
143
+ "Accessability Caption" : str (post ['node' ]['accessibility_caption' ])
144
+ }
145
+
146
+ # Downloads the thumbnails of the post
147
+ for url in post ['node' ]['thumbnail_resources' ]:
148
+ # Picture is just an int index of the url in the list
149
+ with open ('' .join ([random .choice (string .ascii_uppercase ) for x in range (random .randint (1 , 9 ))]) + '.jpg' , 'wb' ) as f :
150
+ # Delay the request times randomly (be nice to Instagram)
151
+ time .sleep (random .randint (2 , 16 ))
152
+ r = requests .get (url ['src' ])
153
+ f .write (r .content )
154
+ print ("Got an Image" )
155
+
156
+ with open ('posts.txt' , 'w' ) as f :
157
+ f .write (json .dumps (posts ))
99
158
100
- self .print_data ()
101
- self .make_directory ()
102
- self .download_profile_picture ()
103
- self .save_data ()
104
159
105
160
def make_directory (self ):
106
- """Makes the profile directory of the profile being searched
161
+ """Makes the profile directory and changes the cwd to it
107
162
:return: True
108
163
"""
109
164
try :
110
- os .mkdir (self .profile_data ['Username' ])
165
+ os .mkdir (self .username )
166
+ os .chdir (self .username )
111
167
except FileExistsError :
112
- print ("Error, directory exists!" )
113
- sys .exit ()
114
-
168
+ os .chdir (self .username )
115
169
116
170
def save_data (self ):
117
171
"""Saves the data to the uname directory
118
172
:return: none
119
173
:param: none
120
174
"""
121
- os .chdir (self .profile_data ['Username' ])
122
- with open ('data.txt' ,'w' ) as f :
175
+ with open ('data.txt' , 'w' ) as f :
123
176
f .write (json .dumps (self .profile_data ))
177
+ self .download_profile_picture ()
124
178
print (f"Saved data to directory { os .getcwd ()} " )
125
179
126
180
return True
127
181
128
182
def print_data (self ):
129
- """Prints out the data to the screen
183
+ """Prints out the data to the screen by iterating through the dict with it's key and value
130
184
:return: True
131
185
"""
132
- #Print the data out to the user
186
+ # Print the data out to the user
133
187
print (colors .HEADER + "---------------------------------------------" + colors .ENDC )
134
188
print (colors .OKGREEN + f"Results: scan for { self .profile_data ['Username' ]} on instagram" + colors .ENDC )
135
- print (f"""Username:{ self .profile_data ["Username" ]} """ )
136
- print (f"URL:{ self .profile_data ['URL' ]} " )
137
- print (f"Profile name: { self .profile_data ['Profile name' ]} " )
138
- print (f"Followers:{ self .profile_data ['Followers' ]} " )
139
- print (f"Following:{ self .profile_data ['Following' ]} " )
140
- print (f"Posts:{ self .profile_data ['Posts' ]} " )
141
- #If the user does not have anything in their bio, the value will not be in the json dump
142
- #So we just set the bio to an empty string
143
- try :
144
- print (f"Profile Bio:{ self .profile_data ['Bio' ]} " )
145
- except KeyError :
146
- self .profile_data ['Bio' ] = ''
147
- print ("Profile Bio: ''" )
148
- print ("" )
149
-
189
+ for key , value in self .profile_data .items ():
190
+ print (key + ':' + value )
150
191
151
192
def download_profile_picture (self ):
152
193
"""Downloads the profile pic and saves it to the directory
153
194
:return: none
154
195
:param: none
155
196
"""
156
- os .chdir (self .profile_data ['Username' ])
157
- with open ("profile_pic.jpg" ,"wb" ) as f :
197
+ with open ("profile_pic.jpg" , "wb" ) as f :
158
198
r = requests .get (self .profile_data ['profile_pic_url' ])
159
199
f .write (r .content )
160
200
161
201
162
-
163
202
def parse_args ():
164
203
parser = argparse .ArgumentParser (description = "Instagram OSINT tool" )
165
204
parser .add_argument ("--username" , help = "profile username" , required = True , nargs = 1 )
@@ -168,16 +207,13 @@ def parse_args():
168
207
169
208
def main():
    """CLI entry point: parse arguments, show the banner and run a scrape.

    :return: none
    """
    args = parse_args()
    print(banner)
    # argparse's nargs=1 stores a LIST, so the original `args.username == ''`
    # could never be true; check the element instead.
    if not args.username or args.username[0] == '':
        print("Please enter the username")
        sys.exit()
    else:
        s = Scraper(args.username[0])
177
216
178
217
179
218
# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
0 commit comments