Web scraping using Beatiful Soup

Question

Web scraping using Beatiful Soup

IKRAM EL MBARKI

2020年7月2日 13:07

I have this code and I wanna extract holidays, petrol and temperature but I don't know where is the problem. I need your help as soon as possible, please. I want to add this extraction to my dataset that is based on date columns, so comparing the scraping data with the dates that I have in my dataset. I also wanna test the impact of each variable holidays, temperature...

import requests
import re
import json
import datefinder
from googletrans import Translator
import datetime



def web_scraping(user_data, dateColumn, country , weather=False, holidays=True, petrole=False) : 
    
    start_time = time.time()
    df = user_data.copy()
    
    
    if holidays : 
        
        ## suppose date column is converted to datetime
        
        print()
        print(Adding holidays data ......)
        translator = Translator()
        country_en = translator.translate(country, dest='en').text.lower()
        
        url = f'https://www.timeanddate.com/holidays/{country_en}/'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        holidays = []
        for i in range(4,len(soup.find_all('th'))) :
            holidays.append(soup.find_all('th')[i].text)

        j=0
        for date in holidays :
            r = datefinder.find_dates(date)
            for _ in r :
                holidays[j] = _

            j+=1

        holidays_df = pd.DataFrame({'holiday date': np.asanyarray(holidays)})
        holidays_df['holiday_month'] = holidays_df['holiday date'].apply(lambda x: x.month)
        holidays_df['holiday_day'] = holidays_df['holiday date'].apply(lambda x: x.day)

        df['is_holiday'] = df[dateColumn]
        for i in range(0, df.shape[0]) :
            for j in range(0, holidays_df.shape[0]) : 
                if (df.loc[i, dateColumn].month == holidays_df.loc[j, 'holiday_month'])  (df.loc[i, dateColumn].day == holidays_df.loc[j, 'holiday_day']) :
                    df.loc[i, 'is_holiday'] = 1
                else :
                    df.loc[i, 'is_holiday'] = 0
    
    
    if weather : 
        
        print()
        print(Adding weather data ......)
        
        #url = 'https://www.wunderground.com/history/daily/ma/nouaceur/GMMN/date/2008-3-24'
        #page = requests.get(url)
        #soup = BeautifulSoup(page.content, 'html.parser')
        
        
        df['temp_moy'] = df[dateColumn]
        scrap_months = list(np.arange(1,384,32))
        for i in range(0,df.shape[0]) : 

            year_data = str(df.loc[i, dateColumn].year)
            month_data = df.loc[i, dateColumn].month
            

            if (int(year_data) = 2009) :

                url = f'https://www.historique-meteo.net/afrique/{country.lower()}/{year_data}/'
                page = requests.get(url)
                soup = BeautifulSoup(page.content, 'html.parser')
                df.loc[i, 'temp_moy'] = soup.find_all('td')[scrap_months[month_data - 1]].text[:2] 
            
            else :
    
                df.loc[i, 'temp_moy'] = np.nan
            print(Progress :  , i, '%')
            
    
    if petrole : 
        
        print()
        print(Adding petrol data ......)
        df['petrole_USD'] = df[dateColumn]
        for i in range(0, df.shape[0]) : 
            
            
            ##convert to posix date integrable in url 
            
            cur_date = df.loc[i, dateColumn]
            cur_date_unix = int(time.mktime(cur_date.timetuple()))
            next_date = cur_date + datetime.timedelta(days=1)
            next_date_unix = int(time.mktime(next_date.timetuple()))
    
            ## dynamic web scraping
            
            url_petrole = f'https://query2.finance.yahoo.com/v8/finance/chart/CL=F?formatted=truecrumb=RoQtzbt66M5lang=en-USregion=USinterval=1dperiod1={cur_date_unix}period2={next_date_unix}events=div%7CsplitcorsDomain=finance.yahoo.com'
            result_p = requests.get(url_petrole, headers={'Referer': 'https://finance.yahoo.com/quote/CL%3DF/history?p=CL%3DF'})
            
            if result_p.json()['chart']['result'][0]['indicators']['adjclose'] != None : 
                
                adj_close_dict = result_p.json()['chart']['result'][0]['indicators']['adjclose'][0]

                if len(adj_close_dict) == 0 :

                    df.loc[i, 'petrole_USD'] = np.nan



                elif len(adj_close_dict['adjclose']) == 1 : 

                    df.loc[i,'petrole_USD'] = adj_close_dict['adjclose'][0]

                else : 

                    length = len(adj_close_dict['adjclose'])

                    df.loc[i,'petrole_USD'] = adj_close_dict['adjclose'][length - 1]
            
            else : 
                    
                df.loc[i, 'petrole_USD'] = np.nan
    
    
    
    
    
    
    
    
    
    
    print()
    print(CPU time for the third sub-module : %s seconds % (time.time() - start_time))
    
    
    
    
    
    
    
    return df   ```

Topic web-scraping forecasting time-series python machine-learning

Category Data Science

Web scraping using Beatiful Soup

About