Source code for sportsbet.datasets._soccer._fd

"""
Download and transform historical and fixtures data
for various leagues from Football-Data.co.uk.

Football-Data.co.uk: http://www.football-data.co.uk/data.php
"""

# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT

from urllib.request import urlopen, urljoin
from datetime import datetime
from os.path import join
from functools import lru_cache

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from rich.progress import track
from sklearn.model_selection import ParameterGrid

from . import OUTCOMES
from .._base import _BaseDataLoader, _read_csv

URL = 'http://www.football-data.co.uk'
BASE_URLS = [
    'englandm.php',
    'scotlandm.php',
    'germanym.php',
    'italym.php',
    'spainm.php',
    'francem.php',
    'netherlandsm.php',
    'belgiumm.php',
    'portugalm.php',
    'turkeym.php',
    'greecem.php',
    'Argentina.php',
    'Austria.php',
    'Brazil.php',
    'China.php',
    'Denmark.php',
    'Finland.php',
    'Ireland.php',
    'Japan.php',
    'Mexico.php',
    'Norway.php',
    'Poland.php',
    'Romania.php',
    'Russia.php',
    'Sweden.php',
    'Switzerland.php',
    'USA.php',
]
LEAGUES_MAPPING = {
    'England': ('E', '0', '1', '2', '3', 'C'),
    'Scotland': ('SC', '0', '1', '2', '3', 'C'),
    'Germany': ('D', '1', '2'),
    'Italy': ('I', '1', '2'),
    'Spain': ('SP', '1', '2'),
    'France': ('F', '1', '2'),
    'Netherlands': ('N', '1'),
    'Belgium': ('B', '1'),
    'Portugal': ('P', '1'),
    'Turkey': ('T', '1'),
    'Greece': ('G', '1'),
    'Argentina': ('ARG', '1'),
    'Austria': ('AUT', '1'),
    'Brazil': ('BRA', '1'),
    'China': ('CHN', '1'),
    'Denmark': ('DNK', '1'),
    'Finland': ('FIN', '1'),
    'Ireland': ('IRL', '1'),
    'Japan': ('JPN', '1'),
    'Mexico': ('MEX', '1'),
    'Norway': ('NOR', '1'),
    'Poland': ('POL', '1'),
    'Romania': ('ROU', '1'),
    'Russia': ('RUS', '1'),
    'Sweden': ('SWE', '1'),
    'Switzerland': ('SWZ', '1'),
    'USA': ('USA', '1'),
}


def _convert_base_url_to_league(base_url):
    league = base_url.replace('.php', '')
    if base_url[0].islower():
        league = league[:-1].capitalize()
    return league


def _extract_csv_urls(base_url):
    html = urlopen(urljoin(URL, base_url))
    bsObj = BeautifulSoup(html.read(), features='html.parser')
    return {
        el.get('href') for el in bsObj.find_all('a') if el.get('href').endswith('csv')
    }


def _param_grid_to_csv_urls(param_grid):
    urls = []
    for params in param_grid:
        in_main_leagues = f'{params["league"].lower()}m.php' in BASE_URLS
        encoded_league, *divisions = LEAGUES_MAPPING[params['league']]
        if in_main_leagues:
            year = f'{str(params["year"] - 1)[2:]}{str(params["year"])[2:]}'
            if '0' in divisions:
                division = (
                    str(params['division'] - 1) if params['division'] != 5 else 'C'
                )
            else:
                division = str(params['division'])
            urls.append(
                (params, join(URL, 'mmz4281', year, f'{encoded_league}{division}.csv'))
            )
        else:
            urls.append((params, join(URL, 'new', f'{encoded_league}.csv')))
    return urls


[docs]class FDSoccerDataLoader(_BaseDataLoader): """Dataloader for Football-Data.co.uk soccer data. Read more in the :ref:`user guide <user_guide>`. Parameters ---------- param_grid : dict of str to sequence, or sequence of such parameter, default=None The parameter grid to explore, as a dictionary mapping data parameters to sequences of allowed values. An empty dict signifies default parameters. A sequence of dicts signifies a sequence of grids to search, and is useful to avoid exploring parameter combinations that do not exist. The default value corresponds to all parameters. Examples -------- >>> """ _removed_cols = [ 'Div', 'Country', 'Season', 'Time', 'FTR', 'Res', 'Attendance', 'Referee', 'HTR', 'BbAH', 'Bb1X2', 'BbOU', 'League', 'divisions', ] _cols_mapping = { 'HT': 'home_team', 'Home': 'home_team', 'AT': 'away_team', 'Away': 'away_team', 'LB': 'ladbrokes__home_win__odds', 'LB.1': 'ladbrokes__draw__odds', 'LB.2': 'ladbrokes__away_win__odds', 'PH': 'pinnacle__home_win__odds', 'PD': 'pinnacle__draw__odds', 'PA': 'pinnacle__away_win__odds', 'HG': 'home_team__full_time_goals', 'AG': 'away_team__full_time_goals', 'HomeTeam': 'home_team', 'AwayTeam': 'away_team', 'Date': 'date', 'B365AH': 'bet365__size_of_asian_handicap_home_team__odds', 'LBAH': 'ladbrokes__size_of_asian_handicap_home_team__odds', 'BbAHh': 'betbrain__size_of_asian_handicap_home_team__odds', 'GBAH': 'gamebookers__size_of_handicap_home_team__odds', 'AHh': 'market_average__size_of_handicap_home_team__odds', 'AHCh': 'market_average_closing__size_of_asian_handicap_home_team__odds', 'B365H': 'bet365__home_win__odds', 'B365D': 'bet365__draw__odds', 'B365A': 'bet365__away_win__odds', 'B365>2.5': 'bet365__over_2.5__odds', 'B365<2.5': 'bet365__under_2.5__odds', 'B365AHH': 'bet365__asian_handicap_home_team__odds', 'B365AHA': 'bet365__asian_handicap_away_team__odds', 'B365CH': 'bet365_closing__home_win__odds', 'B365CD': 'bet365_closing__draw__odds', 'B365CA': 'bet365_closing__away_win__odds', 'B365C>2.5': 'bet365_closing__over_2.5__odds', 'B365C<2.5': 'bet365_closing__under_2.5__odds', 'B365CAHH': 'bet365_closing__asian_handicap_home_team__odds', 'B365CAHA': 'bet365_closing__asian_handicap_away_team__odds', 'BbMxH': 'betbrain_maximum__home_win__odds', 'BbMxD': 'betbrain_maximum__draw__odds', 'BbMxA': 'betbrain_maximum__away_win__odds', 'BbMx>2.5': 'betbrain_maximum__over_2.5__odds', 'BbMx<2.5': 'betbrain_maximum__under_2.5__odds', 'BbMxAHH': 'betbrain_maximum__asian_handicap_home_team__odds', 'BbMxAHA': 'betbrain_maximum__asian_handicap_away_team__odds', 'BbAvH': 'betbrain_average__home_win__odds', 'BbAvD': 'betbrain_average__draw_win__odds', 'BbAvA': 'betbrain_average__away_win__odds', 'BbAv>2.5': 'betbrain_average__over_2.5__odds', 'BbAv<2.5': 'betbrain_average__under_2.5__odds', 'BbAvAHH': 'betbrain_average__asian_handicap_home_team__odds', 'BbAvAHA': 'betbrain_average__asian_handicap_away_team__odds', 'BWH': 'betwin__home_win__odds', 'BWD': 'betwin__draw__odds', 'BWA': 'betwin__away_win__odds', 'BWCH': 'betwin_closing__home_win__odds', 'BWCD': 'betwin_closing__draw__odds', 'BWCA': 'betwin_closing__away_win__odds', 'BSH': 'bluesquare__home_win__odds', 'BSD': 'bluesquare__draw__odds', 'BSA': 'bluesquare__away_win__odds', 'GBH': 'gamebookers__home_win__odds', 'GBD': 'gamebookers__draw__odds', 'GBA': 'gamebookers__away_win__odds', 'GB>2.5': 'gamebookers__over_2.5__odds', 'GB<2.5': 'gamebookers__under_2.5__odds', 'GBAHH': 'gamebookers__asian_handicap_home_team__odds', 'GBAHA': 'gamebookers__asian_handicap_away_team__odds', 'IWH': 'interwetten__home_win__odds', 'IWD': 'interwetten__draw__odds', 'IWA': 'interwetten__away_win__odds', 'IWCH': 'interwetten_closing__home_win__odds', 'IWCD': 'interwetten_closing__draw__odds', 'IWCA': 'interwetten_closing__away_win__odds', 'LBH': 'ladbrokes__home_win__odds', 'LBD': 'ladbrokes__draw__odds', 'LBA': 'ladbrokes__away_win__odds', 'LBAHH': 'ladbrokes__asian_handicap_home_team__odds', 'LBAHA': 'ladbrokes__asian_handicap_away_team__odds', 'PSH': 'pinnacle__home_win__odds', 'PSD': 'pinnacle__draw__odds', 'PSA': 'pinnacle__away_win__odds', 'P>2.5': 'pinnacle__over_2.5__odds', 'P<2.5': 'pinnacle__under_2.5__odds', 'PAHH': 'pinnacle__asian_handicap_home_team__odds', 'PAHA': 'pinnacle__asian_handicap_away_team__odds', 'PSCH': 'pinnacle_closing__home_win__odds', 'PSCD': 'pinnacle_closing__draw__odds', 'PSCA': 'pinnacle_closing__away_win__odds', 'PC>2.5': 'pinnacle_closing__over_2.5__odds', 'PC<2.5': 'pinnacle_closing__under_2.5__odds', 'PCAHH': 'pinnacle_closing__asian_handicap_home_team__odds', 'PCAHA': 'pinnacle_closing__asian_handicap_away_team__odds', 'SOH': 'sporting__home_win__odds', 'SOD': 'sporting__draw__odds', 'SOA': 'sporting__away_win__odds', 'SBH': 'sportingbet__home_win__odds', 'SBD': 'sportingbet__draw__odds', 'SBA': 'sportingbet__away_win__odds', 'SJH': 'stanjames__home_win__odds', 'SJD': 'stanjames__draw__odds', 'SJA': 'stanjames__away_win__odds', 'SYH': 'stanleybet__home_win__odds', 'SYD': 'stanleybet__draw__odds', 'SYA': 'stanleybet__away_win__odds', 'VCH': 'vcbet__home_win__odds', 'VCD': 'vcbet__draw__odds', 'VCA': 'vcbet__away_win__odds', 'VCCH': 'vcbet_closing__home_win__odds', 'VCCD': 'vcbet_closing__draw__odds', 'VCCA': 'vcbet_closing__away_win__odds', 'WHH': 'williamhill__home_win__odds', 'WHD': 'williamhill__draw__odds', 'WHA': 'williamhill__away_win__odds', 'WHCH': 'williamhill_closing__home_win__odds', 'WHCD': 'williamhill_closing__draw__odds', 'WHCA': 'williamhill_closing__away_win__odds', 'MaxH': 'market_maximum__home_win__odds', 'MaxD': 'market_maximum__draw__odds', 'MaxA': 'market_maximum__away_win__odds', 'Max>2.5': 'market_maximum__over_2.5__odds', 'Max<2.5': 'market_maximum__under_2.5__odds', 'MaxAHH': 'market_maximum__asian_handicap_home_team__odds', 'MaxAHA': 'market_maximum__asian_handicap_away_team__odds', 'MaxCH': 'market_maximum_closing__home_win__odds', 'MaxCD': 'market_maximum_closing__draw__odds', 'MaxCA': 'market_maximum_closing__away_win__odds', 'MaxC>2.5': 'market_maximum_closing__over_2.5__odds', 'MaxC<2.5': 'market_maximum_closing__under_2.5__odds', 'MaxCAHH': 'market_maximum_closing__asian_handicap_home_team__odds', 'MaxCAHA': 'market_maximum_closing__asian_handicap_away_team__odds', 'AvgH': 'market_average__home_win__odds', 'AvgD': 'market_average__draw__odds', 'AvgA': 'market_average__away_win__odds', 'Avg>2.5': 'market_average__over_2.5__odds', 'Avg<2.5': 'market_average__under_2.5__odds', 'AvgAHH': 'market_average__asian_handicap_home_team__odds', 'AvgAHA': 'market_average__asian_handicap_away_team__odds', 'AvgCH': 'market_average_closing__home_win__odds', 'AvgCD': 'market_average_closing__draw__odds', 'AvgCA': 'market_average_closing__away_win__odds', 'AvgC>2.5': 'market_average_closing__over_2.5__odds', 'AvgC<2.5': 'market_average_closing__under_2.5__odds', 'AvgCAHH': 'market_average_closing__asian_handicap_home_team__odds', 'AvgCAHA': 'market_average_closing__asian_handicap_away_team__odds', 'FTHG': 'home_team__full_time_goals', 'FTAG': 'away_team__full_time_goals', 'HTHG': 'home_team__half_time_goals', 'HTAG': 'away_team__half_time_goals', 'HS': 'home_team__shots', 'AS': 'away_team__shots', 'HST': 'home_team__shots_on_target', 'AST': 'away_team__shots_on_target', 'HHW': 'home_team__hit_woodork', 'AHW': 'away_team__hit_woodork', 'HC': 'home_team__corners', 'AC': 'away_team__corners', 'HF': 'home_team__fouls_committed', 'AF': 'away_team__fouls_committed', 'HFKC': 'home_team__free_kicks_conceded', 'AFKC': 'away_team__free_kicks_conceded', 'HO': 'home_team__offsides', 'AO': 'away_team__offsides', 'HY': 'home_team__yellow_cards', 'AY': 'away_team__yellow_cards', 'HR': 'home_team__red_cards', 'AR': 'away_team__red_cards', 'HBP': 'home_team__bookings_points', 'ABP': 'away_team__bookings_points', } @classmethod def _get_schema(cls): return [ ('league', object), ('division', int), ('year', int), ('home_team', object), ('away_team', object), ('date', np.datetime64), ('bet365__size_of_asian_handicap_home_team__odds', object), ('ladbrokes__size_of_asian_handicap_home_team__odds', object), ('betbrain__size_of_asian_handicap_home_team__odds', object), ('gamebookers__size_of_handicap_home_team__odds', object), ('market_average__size_of_handicap_home_team__odds', object), ('market_average_closing__size_of_asian_handicap_home_team__odds', object), ('bet365__home_win__odds', float), ('bet365__draw__odds', float), ('bet365__away_win__odds', float), ('bet365__over_2.5__odds', float), ('bet365__under_2.5__odds', float), ('bet365__asian_handicap_home_team__odds', float), ('bet365__asian_handicap_away_team__odds', float), ('bet365_closing__home_win__odds', float), ('bet365_closing__draw__odds', float), ('bet365_closing__away_win__odds', float), ('bet365_closing__over_2.5__odds', float), ('bet365_closing__under_2.5__odds', float), ('bet365_closing__asian_handicap_home_team__odds', float), ('bet365_closing__asian_handicap_away_team__odds', float), ('betbrain_maximum__home_win__odds', float), ('betbrain_maximum__draw__odds', float), ('betbrain_maximum__away_win__odds', float), ('betbrain_maximum__over_2.5__odds', float), ('betbrain_maximum__under_2.5__odds', float), ('betbrain_maximum__asian_handicap_home_team__odds', float), ('betbrain_maximum__asian_handicap_away_team__odds', float), ('betbrain_average__home_win__odds', float), ('betbrain_average__draw_win__odds', float), ('betbrain_average__away_win__odds', float), ('betbrain_average__over_2.5__odds', float), ('betbrain_average__under_2.5__odds', float), ('betbrain_average__asian_handicap_home_team__odds', float), ('betbrain_average__asian_handicap_away_team__odds', float), ('betwin__home_win__odds', float), ('betwin__draw__odds', float), ('betwin__away_win__odds', float), ('betwin_closing__home_win__odds', float), ('betwin_closing__draw__odds', float), ('betwin_closing__away_win__odds', float), ('bluesquare__home_win__odds', float), ('bluesquare__draw__odds', float), ('bluesquare__away_win__odds', float), ('gamebookers__home_win__odds', float), ('gamebookers__draw__odds', float), ('gamebookers__away_win__odds', float), ('gamebookers__over_2.5__odds', float), ('gamebookers__under_2.5__odds', float), ('gamebookers__asian_handicap_home_team__odds', float), ('gamebookers__asian_handicap_away_team__odds', float), ('interwetten__home_win__odds', float), ('interwetten__draw__odds', float), ('interwetten__away_win__odds', float), ('interwetten_closing__home_win__odds', float), ('interwetten_closing__draw__odds', float), ('interwetten_closing__away_win__odds', float), ('ladbrokes__home_win__odds', float), ('ladbrokes__draw__odds', float), ('ladbrokes__away_win__odds', float), ('ladbrokes__asian_handicap_home_team__odds', float), ('ladbrokes__asian_handicap_away_team__odds', float), ('pinnacle__home_win__odds', float), ('pinnacle__draw__odds', float), ('pinnacle__away_win__odds', float), ('pinnacle__over_2.5__odds', float), ('pinnacle__under_2.5__odds', float), ('pinnacle__asian_handicap_home_team__odds', float), ('pinnacle__asian_handicap_away_team__odds', float), ('pinnacle_closing__home_win__odds', float), ('pinnacle_closing__draw__odds', float), ('pinnacle_closing__away_win__odds', float), ('pinnacle_closing__over_2.5__odds', float), ('pinnacle_closing__under_2.5__odds', float), ('pinnacle_closing__asian_handicap_home_team__odds', float), ('pinnacle_closing__asian_handicap_away_team__odds', float), ('sporting__home_win__odds', float), ('sporting__draw__odds', float), ('sporting__away_win__odds', float), ('sportingbet__home_win__odds', float), ('sportingbet__draw__odds', float), ('sportingbet__away_win__odds', float), ('stanjames__home_win__odds', float), ('stanjames__draw__odds', float), ('stanjames__away_win__odds', float), ('stanleybet__home_win__odds', float), ('stanleybet__draw__odds', float), ('stanleybet__away_win__odds', float), ('vcbet__home_win__odds', float), ('vcbet__draw__odds', float), ('vcbet__away_win__odds', float), ('vcbet_closing__home_win__odds', float), ('vcbet_closing__draw__odds', float), ('vcbet_closing__away_win__odds', float), ('williamhill__home_win__odds', float), ('williamhill__draw__odds', float), ('williamhill__away_win__odds', float), ('williamhill_closing__home_win__odds', float), ('williamhill_closing__draw__odds', float), ('williamhill_closing__away_win__odds', float), ('market_maximum__home_win__odds', float), ('market_maximum__draw__odds', float), ('market_maximum__away_win__odds', float), ('market_maximum__over_2.5__odds', float), ('market_maximum__under_2.5__odds', float), ('market_maximum__asian_handicap_home_team__odds', float), ('market_maximum__asian_handicap_away_team__odds', float), ('market_maximum_closing__home_win__odds', float), ('market_maximum_closing__draw__odds', float), ('market_maximum_closing__away_win__odds', float), ('market_maximum_closing__over_2.5__odds', float), ('market_maximum_closing__under_2.5__odds', float), ('market_maximum_closing__asian_handicap_home_team__odds', float), ('market_maximum_closing__asian_handicap_away_team__odds', float), ('market_average__home_win__odds', float), ('market_average__draw__odds', float), ('market_average__away_win__odds', float), ('market_average__over_2.5__odds', float), ('market_average__under_2.5__odds', float), ('market_average__asian_handicap_home_team__odds', float), ('market_average__asian_handicap_away_team__odds', float), ('market_average_closing__home_win__odds', float), ('market_average_closing__draw__odds', float), ('market_average_closing__away_win__odds', float), ('market_average_closing__over_2.5__odds', float), ('market_average_closing__under_2.5__odds', float), ('market_average_closing__asian_handicap_home_team__odds', float), ('market_average_closing__asian_handicap_away_team__odds', float), ('home_team__full_time_goals', int), ('away_team__full_time_goals', int), ('home_team__half_time_goals', int), ('away_team__half_time_goals', int), ('home_team__shots', int), ('away_team__shots', int), ('home_team__shots_on_target', int), ('away_team__shots_on_target', int), ('home_team__hit_woodork', int), ('away_team__hit_woodork', int), ('home_team__corners', int), ('away_team__corners', int), ('home_team__fouls_committed', int), ('away_team__fouls_committed', int), ('home_team__free_kicks_conceded', int), ('away_team__free_kicks_conceded', int), ('home_team__offsides', int), ('away_team__offsides', int), ('home_team__yellow_cards', int), ('away_team__yellow_cards', int), ('home_team__red_cards', int), ('away_team__red_cards', int), ('home_team__bookings_points', float), ('away_team__bookings_points', float), ] @classmethod def _get_outcomes(cls): return OUTCOMES @classmethod @lru_cache def _get_params(cls): full_param_grid = [] for base_url in BASE_URLS: league = _convert_base_url_to_league(base_url) divisions = LEAGUES_MAPPING[league][1:] urls = _extract_csv_urls(base_url) for url in urls: if base_url[0].islower(): _, year, division = url.split('/') year = datetime.strptime(year[2:], '%y').year division = division.replace('.csv', '')[-1] param_grid = { 'league': [league], 'division': [ int(division) + int('0' in divisions) if division != 'C' else 5 ], 'year': [year], } else: years = _read_csv(urljoin(URL, url), parse_dates='Date')['Season'] years = list( { season + 1 if type(season) is not str else int(season.split('/')[-1]) for season in years.unique() } ) param_grid = {'league': [league], 'division': [1], 'year': years} full_param_grid.append(param_grid) return ParameterGrid(full_param_grid) @lru_cache def _get_data(self): # Training data data_container = [] urls = _param_grid_to_csv_urls(self.param_grid_) for params, url in track(urls, description='Football-Data.co.uk:'): data = _read_csv(url, parse_dates='Date').replace('#REF!', np.nan) if url.split('/')[-2] != 'new': data = data.assign( league=params['league'], division=params['division'], year=params['year'], fixtures=False, ) else: data = data.assign( league=params['league'], division=params['division'], fixtures=False ) data['year'] = data['Season'].apply( lambda season: season + 1 if type(season) is not str else int(season.split('/')[-1]) ) data = data[data.year == params['year']] data = data.drop( columns=[ col for col in data.columns if 'Unnamed' in col or col in self._removed_cols ], ).rename(columns=self._cols_mapping) data_container.append(data) # Fixtures data data = _read_csv(join(URL, 'fixtures.csv'), parse_dates='Date') data['fixtures'] = True inv_leagues_mapping = {v[0]: k for k, v in LEAGUES_MAPPING.items()} data['league'] = data['Div'].apply(lambda div: inv_leagues_mapping[div[:-1]]) data['division'] = data['Div'].apply(lambda div: div[-1]) data['divisions'] = data['league'].apply( lambda league: LEAGUES_MAPPING[league][1:] ) data['division'] = ( data[['division', 'divisions']] .apply( lambda row: row[0] if 'C' not in row[1] else (row[0] - 1 if isinstance(row[0], int) else 4), axis=1, ) .astype(int) ) years = ( pd.DataFrame(self.get_all_params()).groupby(['league', 'division']).max() ).reset_index() data = pd.merge(data, years, how='left') data = data.drop( columns=[ col for col in data.columns if 'Unnamed' in col or col in self._removed_cols ] ).rename(columns=self._cols_mapping) data_container.append(data) # Combine data data = pd.concat(data_container, ignore_index=True) return data.sort_values(['league', 'division', 'year'], ignore_index=True)