Skip to content

Commit d35280b

Browse files
amazon web scrapper
tracking daily price changes
1 parent 98c213f commit d35280b

File tree

1 file changed

+303
-0
lines changed

1 file changed

+303
-0
lines changed
Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "6be2133d",
6+
"metadata": {},
7+
"source": [
8+
"# Amazon Web Scraping\n",
9+
"## Obtaining daily price changes for a product (a renewed iPad)"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 12,
15+
"id": "46eef088",
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"# import libraries\n",
20+
"\n",
21+
"from bs4 import BeautifulSoup\n",
22+
"import requests\n",
23+
"import smtplib\n",
24+
"import time\n",
25+
"import datetime\n"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 22,
31+
"id": "b461026c",
32+
"metadata": {},
33+
"outputs": [
34+
{
35+
"name": "stdout",
36+
"output_type": "stream",
37+
"text": [
38+
"\n",
39+
" 2018 Apple iPad (9.7-inch, WiFi, 32GB) - Space Grey (Renewed)\n",
40+
" \n",
41+
"\n",
42+
" £175.00\n",
43+
" \n"
44+
]
45+
}
46+
],
47+
"source": [
48+
"# connect to the product website/link\n",
49+
"URL = \"https://www.amazon.co.uk/Apple-iPad-WI-FI-32GB-Refurbished/dp/B07NYS898H/ref=zg-bs_amazon-renewed_sccl_3/258-0469945-6067500?pd_rd_w=WxkkY&content-id=amzn1.sym.401f1a3a-5fa9-46fb-9ed2-7c7d241a11cd&pf_rd_p=401f1a3a-5fa9-46fb-9ed2-7c7d241a11cd&pf_rd_r=43YKHB1FR44WPFEHVM75&pd_rd_wg=iNlum&pd_rd_r=d20b281e-2b5c-4285-a827-bcfe4f3319dc&pd_rd_i=B07NYS898H&psc=1\"\n",
50+
"# browser-like request headers; the User-Agent value was obtained from https://httpbin.org/get\n",
51+
"headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42\", \"Accept-Encoding\":\"gzip, deflate\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\", \"DNT\":\"1\",\"Connection\":\"close\", \"Upgrade-Insecure-Requests\":\"1\"}\n",
52+
"\n",
53+
"# a timeout keeps the cell from hanging forever on a stalled connection\n",
54+
"page = requests.get(URL, headers=headers, timeout=30)\n",
55+
"\n",
56+
"# pulling the content from the page and formatting it better with prettify\n",
57+
"soup1 = BeautifulSoup(page.content, \"html.parser\")\n",
58+
"soup2 = BeautifulSoup(soup1.prettify(), \"html.parser\")\n",
59+
"\n",
60+
"# specify content we want; find() returns None when an element is absent from the downloaded page\n",
61+
"title_tag, price_tag = soup2.find(id='productTitle'), soup2.find(id='renewedBuyBoxPrice')\n",
62+
"if title_tag is None or price_tag is None:\n",
63+
"    raise RuntimeError('productTitle / renewedBuyBoxPrice not found in the downloaded page')\n",
64+
"title, price = title_tag.get_text(), price_tag.get_text()\n",
65+
"print(title)\n",
66+
"print(price)"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": 23,
72+
"id": "40059151",
73+
"metadata": {},
74+
"outputs": [
75+
{
76+
"name": "stdout",
77+
"output_type": "stream",
78+
"text": [
79+
"2018 Apple iPad (9.7-inch, WiFi, 32GB) - Space Grey (Renewed)\n",
80+
"175.00\n"
81+
]
82+
}
83+
],
84+
"source": [
85+
"# clean output\n",
86+
"# strip removes surrounding whitespace; lstrip('£') drops the pound sign and, unlike [1:],\n",
87+
"# does not eat a digit of the price if this cell is run a second time\n",
88+
"price = price.strip().lstrip('£')\n",
89+
"title = title.strip()\n",
90+
"\n",
91+
"print(title)\n",
92+
"print(price)"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": 24,
98+
"id": "e26fc53a",
99+
"metadata": {},
100+
"outputs": [
101+
{
102+
"name": "stdout",
103+
"output_type": "stream",
104+
"text": [
105+
"2022-12-22\n"
106+
]
107+
}
108+
],
109+
"source": [
110+
"# Obtain date when price is extracted\n",
111+
"\n",
112+
"import datetime\n",
113+
"\n",
114+
"today = datetime.date.today()\n",
115+
"\n",
116+
"print(today)"
117+
]
118+
},
119+
{
120+
"cell_type": "code",
121+
"execution_count": 25,
122+
"id": "f71e642a",
123+
"metadata": {},
124+
"outputs": [],
125+
"source": [
126+
"# create csv to import data obtained\n",
127+
"\n",
128+
"import csv\n",
129+
"\n",
130+
"header = ['Title', 'Price', 'Date']\n",
131+
"data = [title, price, today]\n",
132+
"\n",
133+
"# note: w = write, newline = no space when adding data\n",
134+
"with open('AmazonWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as f:\n",
135+
" writer = csv.writer(f)\n",
136+
" writer.writerow(header)\n",
137+
" writer.writerow(data)\n",
138+
" \n"
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 26,
144+
"id": "d3b647c3",
145+
"metadata": {},
146+
"outputs": [
147+
{
148+
"name": "stdout",
149+
"output_type": "stream",
150+
"text": [
151+
" Title Price Date\n",
152+
"0 2018 Apple iPad (9.7-inch, WiFi, 32GB) - Space... 175.0 2022-12-22\n"
153+
]
154+
}
155+
],
156+
"source": [
157+
"# view csv data\n",
158+
"\n",
159+
"import pandas as pd\n",
160+
"# read the same relative path the csv was written to, instead of a user-specific absolute path\n",
161+
"df = pd.read_csv('AmazonWebScraperDataset.csv')\n",
162+
"\n",
163+
"print(df)"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": 27,
169+
"id": "79cfcf7a",
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"# append new data to csv\n",
174+
"\n",
175+
"# note: a+ = append\n",
176+
"with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as f:\n",
177+
" writer = csv.writer(f)\n",
178+
" writer.writerow(data)\n"
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": 30,
184+
"id": "eb0a3f26",
185+
"metadata": {},
186+
"outputs": [],
187+
"source": [
188+
"# create function to automate the price check using the code we used above\n",
189+
"\n",
190+
"def check_price():\n",
191+
"    \"\"\"Fetch the product page and append [title, price, date] to AmazonWebScraperDataset.csv.\"\"\"\n",
192+
"    URL = \"https://www.amazon.co.uk/Apple-iPad-WI-FI-32GB-Refurbished/dp/B07NYS898H/ref=zg-bs_amazon-renewed_sccl_3/258-0469945-6067500?pd_rd_w=WxkkY&content-id=amzn1.sym.401f1a3a-5fa9-46fb-9ed2-7c7d241a11cd&pf_rd_p=401f1a3a-5fa9-46fb-9ed2-7c7d241a11cd&pf_rd_r=43YKHB1FR44WPFEHVM75&pd_rd_wg=iNlum&pd_rd_r=d20b281e-2b5c-4285-a827-bcfe4f3319dc&pd_rd_i=B07NYS898H&psc=1\"\n",
193+
"    headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42\", \"Accept-Encoding\":\"gzip, deflate\", \"Accept\":\"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\", \"DNT\":\"1\",\"Connection\":\"close\", \"Upgrade-Insecure-Requests\":\"1\"}\n",
194+
"    # a timeout keeps one stalled request from blocking the daily loop forever\n",
195+
"    page = requests.get(URL, headers=headers, timeout=30)\n",
196+
"\n",
197+
"    soup1 = BeautifulSoup(page.content, \"html.parser\")\n",
198+
"    soup2 = BeautifulSoup(soup1.prettify(), \"html.parser\")\n",
199+
"\n",
200+
"    title_tag = soup2.find(id='productTitle')\n",
201+
"    price_tag = soup2.find(id='renewedBuyBoxPrice')\n",
202+
"    if title_tag is None or price_tag is None:\n",
203+
"        # find() returned None: the page lacks the expected elements, so skip this reading\n",
204+
"        print('check_price: title or price not found, nothing recorded')\n",
205+
"        return\n",
206+
"\n",
207+
"    # strip whitespace; the [1:] drops the leading pound sign from the price\n",
208+
"    title = title_tag.get_text().strip()\n",
209+
"    price = price_tag.get_text().strip()[1:]\n",
210+
"    today = datetime.date.today()\n",
211+
"\n",
212+
"    # append the new reading to the csv\n",
213+
"    import csv\n",
214+
"    with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as f:\n",
215+
"        writer = csv.writer(f)\n",
216+
"        writer.writerow([title, price, today])\n",
217+
"\n",
218+
"    # price is a string, so it must be converted before the comparison\n",
219+
"    # if float(price) < 150:\n",
220+
"    #     send_mail()"
221+
]
222+
},
223+
{
224+
"cell_type": "code",
225+
"execution_count": null,
226+
"id": "a8f84329",
227+
"metadata": {
228+
"scrolled": true
229+
},
230+
"outputs": [],
231+
"source": [
232+
"# check price daily and automatically update csv\n",
233+
"\n",
234+
"while(True):\n",
235+
" check_price()\n",
236+
" time.sleep(86400) #repeats daily"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": null,
242+
"id": "bd53dc4b",
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
246+
"# look at csv data (same relative path that check_price appends to)\n",
247+
"\n",
248+
"import pandas as pd\n",
249+
"df = pd.read_csv('AmazonWebScraperDataset.csv')\n",
250+
"\n",
251+
"print(df)"
252+
]
253+
},
254+
{
255+
"cell_type": "code",
256+
"execution_count": null,
257+
"id": "914ad6f3",
258+
"metadata": {},
259+
"outputs": [],
260+
"source": [
261+
"# in case I want to get an email when the price goes down\n",
262+
"\n",
263+
"def send_mail():\n",
264+
"    \"\"\"Email a price-drop alert through Gmail using SMTP over SSL (port 465).\"\"\"\n",
265+
"    import os\n",
266+
"    from email.message import EmailMessage\n",
267+
"    # credentials come from the environment (GMAIL_USER / GMAIL_APP_PASSWORD), not from the notebook\n",
268+
"    account = os.environ['GMAIL_USER']\n",
269+
"\n",
270+
"    alert = EmailMessage()  # EmailMessage encodes the non-ASCII '£' in the subject\n",
271+
"    alert['Subject'] = \"That ipad you were looking at is now below £150!\"\n",
272+
"    alert['From'] = alert['To'] = account  # NOTE(review): recipient assumed to be the sender's own mailbox - confirm\n",
273+
"    alert.set_content(\"Cheila, the product is at a reduced price!\")\n",
274+
"\n",
275+
"    # the with-block closes the SMTP connection even if login or sending fails\n",
276+
"    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:\n",
277+
"        server.login(account, os.environ['GMAIL_APP_PASSWORD'])\n",
278+
"        server.send_message(alert)"
279+
]
280+
}
281+
],
282+
"metadata": {
283+
"kernelspec": {
284+
"display_name": "Python 3 (ipykernel)",
285+
"language": "python",
286+
"name": "python3"
287+
},
288+
"language_info": {
289+
"codemirror_mode": {
290+
"name": "ipython",
291+
"version": 3
292+
},
293+
"file_extension": ".py",
294+
"mimetype": "text/x-python",
295+
"name": "python",
296+
"nbconvert_exporter": "python",
297+
"pygments_lexer": "ipython3",
298+
"version": "3.9.13"
299+
}
300+
},
301+
"nbformat": 4,
302+
"nbformat_minor": 5
303+
}

0 commit comments

Comments
 (0)