Web scraping using Beautiful Soup
I have this code and I want to extract holiday, petrol-price and temperature data, but I don't know where the problem is. I need your help as soon as possible, please. I want to add the extracted data to my dataset, which is organised around a date column, by matching the scraped data against the dates in my dataset. I also want to test the impact of each variable (holidays, temperature, ...).
import datetime
import json
import re
import time

import datefinder
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
def web_scraping(user_data, dateColumn, country , weather=False, holidays=True, petrole=False) :
start_time = time.time()
df = user_data.copy()
if holidays :
## suppose date column is converted to datetime
print(Adding holidays data ......)
translator = Translator()
country_en = translator.translate(country, dest='en').text.lower()
url = f'https://www.timeanddate.com/holidays/{country_en}/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
holidays = []
for i in range(4,len(soup.find_all('th'))) :
for date in holidays :
r = datefinder.find_dates(date)
for _ in r :
holidays[j] = _
holidays_df = pd.DataFrame({'holiday date': np.asanyarray(holidays)})
holidays_df['holiday_month'] = holidays_df['holiday date'].apply(lambda x: x.month)
holidays_df['holiday_day'] = holidays_df['holiday date'].apply(lambda x: x.day)
df['is_holiday'] = df[dateColumn]
for i in range(0, df.shape[0]) :
for j in range(0, holidays_df.shape[0]) :
if (df.loc[i, dateColumn].month == holidays_df.loc[j, 'holiday_month']) (df.loc[i, dateColumn].day == holidays_df.loc[j, 'holiday_day']) :
df.loc[i, 'is_holiday'] = 1
else :
df.loc[i, 'is_holiday'] = 0
if weather :
print(Adding weather data ......)
#url = 'https://www.wunderground.com/history/daily/ma/nouaceur/GMMN/date/2008-3-24'
#page = requests.get(url)
#soup = BeautifulSoup(page.content, 'html.parser')
df['temp_moy'] = df[dateColumn]
scrap_months = list(np.arange(1,384,32))
for i in range(0,df.shape[0]) :
year_data = str(df.loc[i, dateColumn].year)
month_data = df.loc[i, dateColumn].month
if (int(year_data) = 2009) :
url = f'https://www.historique-meteo.net/afrique/{country.lower()}/{year_data}/'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
df.loc[i, 'temp_moy'] = soup.find_all('td')[scrap_months[month_data - 1]].text[:2]
else :
df.loc[i, 'temp_moy'] = np.nan
print(Progress : , i, '%')
if petrole :
print(Adding petrol data ......)
df['petrole_USD'] = df[dateColumn]
for i in range(0, df.shape[0]) :
##convert to posix date integrable in url
cur_date = df.loc[i, dateColumn]
cur_date_unix = int(time.mktime(cur_date.timetuple()))
next_date = cur_date + datetime.timedelta(days=1)
next_date_unix = int(time.mktime(next_date.timetuple()))
## dynamic web scraping
url_petrole = f'https://query2.finance.yahoo.com/v8/finance/chart/CL=F?formatted=truecrumb=RoQtzbt66M5lang=en-USregion=USinterval=1dperiod1={cur_date_unix}period2={next_date_unix}events=div%7CsplitcorsDomain=finance.yahoo.com'
result_p = requests.get(url_petrole, headers={'Referer': 'https://finance.yahoo.com/quote/CL%3DF/history?p=CL%3DF'})
if result_p.json()['chart']['result'][0]['indicators']['adjclose'] != None :
adj_close_dict = result_p.json()['chart']['result'][0]['indicators']['adjclose'][0]
if len(adj_close_dict) == 0 :
df.loc[i, 'petrole_USD'] = np.nan
elif len(adj_close_dict['adjclose']) == 1 :
df.loc[i,'petrole_USD'] = adj_close_dict['adjclose'][0]
else :
length = len(adj_close_dict['adjclose'])
df.loc[i,'petrole_USD'] = adj_close_dict['adjclose'][length - 1]
else :
df.loc[i, 'petrole_USD'] = np.nan
print(CPU time for the third sub-module : %s seconds % (time.time() - start_time))
return df ```
Topic web-scraping forecasting time-series python machine-learning
Category Data Science