Skip to content

Commit 7c56913

Browse files
authored
Create webscraping.py
1 parent ebeaede commit 7c56913

File tree

1 file changed

+114
-0
lines changed

1 file changed

+114
-0
lines changed
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# HTML Page Read and Upload
2+
# Import useful libraries and classes.
3+
4+
from urllib.request import urlopen as uReq
5+
from bs4 import BeautifulSoup as soup
6+
7+
# Download the search-results page and keep its raw bytes in page_html.
my_url = "https://www.flipkart.com/search?p%5B%5D=facets.brand%255B%255D%3DSamsung&sid=tyy%2F4io&sort=recency_desc&wid=1.productCard.PMU_V2_1"
# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open for the rest of the script.
with uReq(my_url) as web_page:
    page_html = web_page.read()

# Parsing
# Name the parser explicitly: when it is omitted BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the resulting tree
# could differ from one machine to another.
page_soup = soup(page_html, "html.parser")

# Extraction of information
# Collect every <div> whose class is "_2kHMtA"; the rest of the script treats
# each match as one product entry. Print how many were found.
containers = page_soup.find_all("div", {"class": "_2kHMtA"})
print(len(containers))
22+
23+
# Extracting Product Name
# product_name_list holds one entry per container: the text of the first
# <div class="_4rR01T"> found inside it.
product_name_list = []

for box in containers:
    product_name = box.find("div", class_="_4rR01T")
    # find() returns None when the container has no such <div>. Store None in
    # that case rather than raising AttributeError, so this list keeps exactly
    # one entry per container like the other per-product lists.
    product_name_list.append(
        product_name.get_text() if product_name is not None else None
    )
34+
35+
# Extracting Ratings
# rating_list holds one entry per container: the text of the first
# <div class="_3LWZlK"> found inside it.
rating_list = []

for box in containers:
    rating = box.find("div", class_="_3LWZlK")
    # find() returns None when the container has no rating element; such
    # products are recorded with the placeholder '0.0'.
    rating_list.append(rating.text if rating is not None else '0.0')
47+
# Extracting Price of Product
# price_list holds one entry per container: the text of the first
# <div class="_30jeq3"> found inside it, with the rupee sign removed.
price_list = []

for box in containers:
    price = box.find("div", class_="_30jeq3")
    if price is None:
        # No price element in this container: store None instead of raising
        # AttributeError, so the list keeps one entry per container.
        price_list.append(None)
    else:
        # strip('₹') removes the currency sign from either end of the text;
        # everything else (digits, thousands separators) is kept as scraped.
        price_list.append(price.get_text().strip('₹'))
56+
57+
# Each container carries a list of specification bullets, matched here as
# <li class="rgWa7D"> elements. The bullets are read by position:
#   0 -> "RAM | ROM" (split on '|'), 1 -> display, 2 -> camera,
#   3 -> battery, last -> warranty.
# NOTE(review): this positional layout is taken from the original indexing;
# confirm that every product on the page lists its bullets in this order.

# n is the number of containers found on the page.
n = len(containers)

# list to store RAM of phones
ram_list = []
# list to store ROM of phones
rom_list = []
# list to store Display Screen of phones
display_list = []
# list to store Camera Specification of phones
camera_list = []
# list to store Battery Life of phones
battery_life_list = []
# list to store Warranty Period of phones
warranty_list = []
# temporary list holding, per container, the list of its specification bullets
temp_list = []


def _spec_text(specs, index):
    """Return the stripped text of specs[index], or None if it does not exist."""
    try:
        return specs[index].get_text().strip()
    except IndexError:
        return None


for box in containers:
    # all specification bullets of one container
    temp_list.append(box.find_all("li", class_="rgWa7D"))

for specs in temp_list:
    # RAM and ROM share the first bullet, separated by '|'. Split it and trim
    # the whitespace that surrounds the separator.
    memory = _spec_text(specs, 0)
    parts = [part.strip() for part in memory.split('|')] if memory else []
    # A container with fewer bullets than expected (or a first bullet without
    # '|') gets None for the missing fields instead of raising IndexError, so
    # every list still ends up with exactly one entry per container.
    ram_list.append(parts[0] if parts else None)
    rom_list.append(parts[1] if len(parts) > 1 else None)
    display_list.append(_spec_text(specs, 1))
    camera_list.append(_spec_text(specs, 2))
    battery_life_list.append(_spec_text(specs, 3))
    # the warranty is read from the last bullet, whatever its position
    warranty_list.append(_spec_text(specs, -1))
101+
102+
# Creating Pandas DataFrame from Data scraped from Web
# Importing Pandas to create a DataFrame
import pandas as pd

# Map each column name to the list scraped for it; the DataFrame gets one row
# per product and one column per key.
dictionary = {
    'Product_Name': product_name_list,
    'Ratings': rating_list,
    'Price': price_list,
    'RAM_Storage': ram_list,
    'ROM_Storage': rom_list,
    'Display_Screen': display_list,
    'Camera': camera_list,
    'Battery_Life': battery_life_list,
    'Warranty_Life': warranty_list,
}
dataframe = pd.DataFrame(dictionary)

# When this file runs as a script, the value of a bare expression such as
# dataframe.head() is discarded, so the previews are printed explicitly.
# Head of DataFrame
print(dataframe.head())

# Tail of DataFrame
print(dataframe.tail())

0 commit comments

Comments
 (0)