Web Scraping with BeautifulSoup - Python

In this project, we're going to scrape data from Amazon for one item.¶
- That data will be imported to a csv file over time.
Importing Libraries¶
In [3]:
import pandas as pd
import requests
import csv
import smtplib
import time
import datetime
from bs4 import BeautifulSoup as bs
import urllib.request, urllib.error, urllib.parse
Connecting to Website and Pulling the Data¶
In [2]:
url = 'https://www.amazon.ca/dp/B07K4VSLBS/ref=twister_B07K4W1KVW?_encoding=UTF8&th=1'

# Browser-like headers so Amazon serves the normal HTML page.
# Link for headers: https://httpbin.org/get  (copy the User-Agent line)
headers = {
    'content-type': 'text/html;charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

# Pull in the page and parse it once; the original re-parse through
# prettify() was redundant -- html.parser already handles the raw bytes.
page = requests.get(url, headers=headers)
page.raise_for_status()  # fail loudly on a blocked/erroring request instead of on a None lookup below
soup = bs(page.content, 'html.parser')

# Locate the item name and price; get_text(strip=True) removes surrounding whitespace.
title = soup.find(id='productTitle').get_text(strip=True)
# First inner span of the price widget holds e.g. "$199.99"; [1:] drops the currency symbol.
price = soup.select_one('span.a-price span').get_text(strip=True)[1:]
print(title)
print(price)
Kanto SP32PLW 32" Bookshelf Speaker Stands | White | Pair 199.99
Creating time stamp and CSV file with headers and data in file¶
In [3]:
# Timestamp so each collected row records when the data was scraped.
today = datetime.date.today()
print(today)

# Column headers for the CSV file; title/price come from the scraping cell above.
headers = ['Title', 'Price', 'Date']
data = [title, price, today]

# Create the CSV with a header row only if it does not exist yet.
# This replaces the original "comment this block out after the first run"
# approach (which would silently wipe collected data if forgotten, and whose
# writer lines were mis-indented outside the with-block when uncommented).
import os
if not os.path.exists('Project_01_Web_Scraping.csv'):
    with open('Project_01_Web_Scraping.csv', 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerow(data)
2023-01-03
In [4]:
# Read the CSV back to confirm the data landed.
# Use the same relative path the file was written with -- the original
# hardcoded absolute C:\Users\... path only works on the author's machine.
dframe = pd.read_csv('Project_01_Web_Scraping.csv')
dframe  # last expression -> rich DataFrame display instead of plain print()
Title Price Date 0 Kanto SP32PLW 32" Bookshelf Speaker Stands | W... 199.99 2022-12-30
Appending data to the CSV¶
In [5]:
# Append the newly collected row to the CSV ('a+' preserves earlier rows).
with open('Project_01_Web_Scraping.csv', 'a+', newline='', encoding='UTF8') as f:
    csv.writer(f).writerow(data)
Combining all of the above code into one function¶
In [4]:
# combining all the above into one function.
def check_price():
url = 'https://www.amazon.ca/dp/B07K4VSLBS/ref=twister_B07K4W1KVW?_encoding=UTF8&th=1'
headers = {
'content-type': 'text/html;charset=UTF-8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}
page = requests.get(url, headers=headers)
soup1 = bs(page.content, "html.parser")
soup2 = bs(soup1.prettify(), "html.parser")
title = soup2.find(id='productTitle').get_text().strip()
price = soup2.select_one('span.a-price span').get_text().strip()[1:]
import datetime
today = datetime.date.today()
import csv
headers = ['Title', 'Price', 'Date']
data = [title, price, today]
with open('Project_01_Web_Scraping.csv', 'a+', newline='', encoding='UTF8') as f:
writer = csv.writer(f)
writer.writerow(data)
Runs `check_price` after a set time interval¶
In [ ]:
# Re-scrape once a day (86,400 seconds) until the kernel is interrupted.
while True:
    check_price()
    time.sleep(86_400)
In [1]:
# Standalone sanity check (this cell runs first on a fresh kernel, hence its own import).
import pandas as pd

# Relative path matches where check_price() writes; the original absolute
# C:\Users\... path only worked on the author's machine.
df = pd.read_csv('Project_01_Web_Scraping.csv')
df  # rich display instead of print()
Title Price Date 0 Kanto SP32PLW 32" Bookshelf Speaker Stands | W... 199.99 2022-12-30
In [ ]: