2018年世界盃賠率預測 -DNN
阿新 • • 發佈:2018-11-03
# -*- coding: utf-8 -*-
"""
Created on 2 July 2018

@author: user
@summary: Predicting the winner of the 2018 FIFA World Cup
"""
from itertools import combinations

import numpy as np  # linear algebra
import pandas as pd  # data processing
import tensorflow as tf
from sklearn import metrics
from tensorflow.python.data import Dataset

# --- FIFA rankings: keep only the columns used below ---
rankings = pd.read_csv('fifa_ranking.csv')
rankings = rankings.loc[:, ['rank', 'country_full', 'country_abrv',
                            'cur_year_avg_weighted', 'rank_date',
                            'two_year_ago_weighted', 'three_year_ago_weighted']]
# Assign the result back instead of calling replace(..., inplace=True) through
# the attribute accessor: the in-place form mutates a temporary Series and is
# a silent no-op when pandas copy-on-write is active.
rankings['country_full'] = rankings['country_full'].replace(
    "^IR Iran*", "Iran", regex=True)
rankings['weighted_points'] = (rankings['cur_year_avg_weighted']
                               + rankings['two_year_ago_weighted']
                               + rankings['three_year_ago_weighted'])
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

# --- Historical match results ---
matches = pd.read_csv("results.csv")
# Rename teams so they match the names used in the ranking table.
matches = matches.replace({'Germany DR': 'Germany', 'China': 'China PR'})
matches['date'] = pd.to_datetime(matches['date'])

# --- World Cup 2018 group table ---
world_cup = pd.read_csv("World Cup 2018 Dataset.csv")
world_cup = world_cup.loc[:, ['Team', 'Group', 'First match \nagainst',
                              'Second match\n against', 'Third match\n against']]
world_cup = world_cup.dropna(how='all')
# Normalise spellings so team names match the ranking table.
world_cup = world_cup.replace({"IRAN": "Iran",
                               "Costarica": "Costa Rica",
                               "Porugal": "Portugal",
                               "Columbia": "Colombia",
                               "Korea": "Korea Republic"})
world_cup = world_cup.set_index('Team')

# Build a complete day-by-day ranking table: resample each country's rankings
# to daily frequency and carry the last known values forward.
rankings = (rankings.set_index(['rank_date'])
            .groupby(['country_full'], group_keys=False)
            .resample('D')
            .first()
            .ffill()
            .reset_index())

# Join the rankings of the home and the away team onto every match, keyed by
# match date and team name (default inner join, so unmatched rows are dropped).
matches = matches.merge(rankings,
                        left_on=['date', 'home_team'],
                        right_on=['rank_date', 'country_full'])
matches = matches.merge(rankings,
                        left_on=['date', 'away_team'],
                        right_on=['rank_date', 'country_full'],
                        suffixes=('_home', '_away'))

# Feature generation
matches['rank_difference'] = matches['rank_home'] - matches['rank_away']
matches['average_rank'] = (matches['rank_home'] + matches['rank_away']) / 2
matches['point_difference'] = matches['weighted_points_home'] - matches['weighted_points_away']
matches['score_difference'] = matches['home_score'] - matches['away_score']
matches['is_won'] = matches['score_difference'] > 0  # take draw as lost
matches['is_stake'] = matches['tournament'] != 'Friendly'

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

# Randomly permute the rows so the head/tail split below is a random split.
matches = matches.reindex(np.random.permutation(matches.index))


def preprocess_features(matches):
    """Return a copy of the model's input feature columns from ``matches``.

    Args:
      matches: pandas DataFrame containing the columns "average_rank",
        "rank_difference", "point_difference" and "is_stake".
    Returns:
      A new DataFrame holding only those four columns.
    """
    selected_features = matches[["average_rank",
                                 "rank_difference",
                                 "point_difference",
                                 "is_stake"]]
    return selected_features.copy()


def preprocess_targets(matches):
    """Return a one-column DataFrame with the "is_won" label of ``matches``.

    Args:
      matches: pandas DataFrame containing an "is_won" column.
    Returns:
      A DataFrame whose only column is "is_won".
    """
    output_targets = pd.DataFrame()
    output_targets["is_won"] = matches['is_won']
    return output_targets


# Use the first 60% of the shuffled rows for training and the remaining rows
# for validation. The sizes are derived from the data instead of being
# hard-coded; for the 18167-row dataset this yields 10900 / 7267 examples.
TRAINING_FRACTION = 0.6
num_training_examples = int(len(matches) * TRAINING_FRACTION)
num_validation_examples = len(matches) - num_training_examples

training_examples = preprocess_features(matches.head(num_training_examples))
training_targets = preprocess_targets(matches.head(num_training_examples))

validation_examples = preprocess_features(matches.tail(num_validation_examples))
validation_targets = preprocess_targets(matches.tail(num_validation_examples))

# Features and targets of the complete (unsplit) dataset.
Complete_Data_training = preprocess_features(matches)
Complete_Data_Validation = preprocess_targets(matches)


def construct_feature_columns(input_features):
    """Build one numeric TensorFlow feature column per input feature.

    Args:
      input_features: iterable of feature names (iterating a DataFrame
        yields its column names).
    Returns:
      A set of numeric feature columns.
    """
    return {tf.feature_column.numeric_column(my_feature)
            for my_feature in input_features}


def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a tf.data input pipeline and return its next-batch tensors.

    Args:
      features: pandas DataFrame of features
      targets: pandas DataFrame of targets
      batch_size: Size of batches to be passed to the model
      shuffle: True or False. Whether to shuffle the data.
      num_epochs: Number of epochs for which data should be repeated.
                  None = repeat indefinitely
    Returns:
      Tuple of (features, labels) for next data batch
    """
    # Convert pandas data into a dict of np arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}

    # Construct a dataset of individual examples.
    ds = Dataset.from_tensor_slices((features, targets))  # warning: 2GB limit

    # Shuffle individual examples *before* batching and repeating. Shuffling
    # after batch()/repeat() would only permute whole batches and would have
    # to buffer 10000 batches before yielding the first element.
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels


def train_nn_classification_model(
        my_optimizer,
        steps,
        batch_size,
        hidden_units,
        training_examples,
        training_targets,
        validation_examples,
        validation_targets):
    """Train a DNN classifier and print the training log loss per period.

    Training is split into 10 periods; after each period the log loss on the
    training and validation sets is computed and the training loss is printed.

    Args:
      my_optimizer: TensorFlow optimizer instance; it is wrapped with gradient
        clipping (norm 3.0) before use.
      steps: Total number of training steps.
      batch_size: Batch size used for training.
      hidden_units: List with the number of units of each hidden layer.
      training_examples: DataFrame of training features.
      training_targets: DataFrame with an "is_won" column of training labels.
      validation_examples: DataFrame of validation features.
      validation_targets: DataFrame with an "is_won" column of validation labels.
    Returns:
      The trained ``tf.estimator.DNNClassifier``.
    """
    periods = 10
    # Integer division: the estimator expects a whole number of steps. Train
    # at least one step per period so that train() is never called with 0.
    steps_per_period = max(1, steps // periods)

    # Create a DNNClassifier object with gradient clipping on the optimizer.
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 3.0)
    dnn_classifier = tf.estimator.DNNClassifier(
        feature_columns=construct_feature_columns(training_examples),
        hidden_units=hidden_units,
        optimizer=my_optimizer)

    # Create input functions.
    def training_input_fn():
        return my_input_fn(training_examples,
                           training_targets["is_won"],
                           batch_size=batch_size)

    def predict_training_input_fn():
        return my_input_fn(training_examples,
                           training_targets["is_won"],
                           num_epochs=1,
                           shuffle=False)

    def predict_validation_input_fn():
        return my_input_fn(validation_examples,
                           validation_targets["is_won"],
                           num_epochs=1,
                           shuffle=False)

    # Train the model, but do so inside a loop so that we can periodically
    # assess loss metrics.
    print("Training model...")
    print("LogLoss (on training data):")
    # The per-period losses are collected here but are neither plotted nor
    # returned by this function.
    training_log_losses = []
    validation_log_losses = []
    for period in range(periods):
        # Train the model, starting from the prior state.
        dnn_classifier.train(input_fn=training_input_fn, steps=steps_per_period)

        # Take a break and compute predictions.
        training_probabilities = dnn_classifier.predict(input_fn=predict_training_input_fn)
        training_probabilities = np.array(
            [item['probabilities'] for item in training_probabilities])

        validation_probabilities = dnn_classifier.predict(input_fn=predict_validation_input_fn)
        validation_probabilities = np.array(
            [item['probabilities'] for item in validation_probabilities])

        training_log_loss = metrics.log_loss(training_targets, training_probabilities)
        validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)

        # Occasionally print the current loss.
        print(" period %02d : %0.2f" % (period, training_log_loss))

        # Add the loss metrics from this period to our list.
        training_log_losses.append(training_log_loss)
        validation_log_losses.append(validation_log_loss)
    print("Model training finished.")

    return dnn_classifier


# NOTE: despite its name, this is the DNN classifier returned above.
linear_classifier = train_nn_classification_model(
    my_optimizer=tf.train.AdagradOptimizer(learning_rate=0.07),
    steps=3000,
    batch_size=2000,
    hidden_units=[5, 5, 6, 5],
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)


def predict_validation_input_fn():
    """Feed the validation set once, in order, for prediction/evaluation."""
    return my_input_fn(validation_examples,
                       validation_targets["is_won"],
                       num_epochs=1,
                       shuffle=False)


validation_probabilities = linear_classifier.predict(input_fn=predict_validation_input_fn)
# Get just the probabilities for the positive class.
validation_probabilities = np.array([item['probabilities'][1] for item in validation_probabilities])

false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(
    validation_targets, validation_probabilities)

evaluation_metrics = linear_classifier.evaluate(input_fn=predict_validation_input_fn)
print("AUC on the validation set: %0.2f" % evaluation_metrics['auc'])
print("Accuracy on the validation set: %0.2f" % evaluation_metrics['accuracy'])

# ---------------------------------------------------------------------------
# World Cup simulation
# ---------------------------------------------------------------------------

# Small margin around 0.5 inside which a draw is predicted instead of a win.
margin = 0.05

# Rankings at the time of the World Cup: the most recent ranking date,
# restricted to the participating teams.
world_cup_rankings = rankings.loc[
    (rankings['rank_date'] == rankings['rank_date'].max())
    & rankings['country_full'].isin(world_cup.index.unique())]
world_cup_rankings = world_cup_rankings.set_index(['country_full'])


def _predict_home_win_prob(home, away):
    """Return the model's probability that ``home`` beats ``away``.

    Builds a single-row feature frame from the two teams' current rank and
    weighted points and runs it through the trained classifier.

    Args:
      home: name of the first team (index label of ``world_cup_rankings``).
      away: name of the second team (index label of ``world_cup_rankings``).
    Returns:
      The predicted probability of the positive ("is_won") class.
    """
    home_rank = world_cup_rankings.loc[home, 'rank']
    home_points = world_cup_rankings.loc[home, 'weighted_points']
    away_rank = world_cup_rankings.loc[away, 'rank']
    away_points = world_cup_rankings.loc[away, 'weighted_points']

    match_features = pd.DataFrame({
        'average_rank': [(home_rank + away_rank) / 2],
        'rank_difference': [home_rank - away_rank],
        'point_difference': [home_points - away_points],
        'is_stake': [True],  # simulated matches are treated as non-friendly
    })
    # The input function requires labels; they are not used for prediction.
    placeholder_labels = pd.Series([np.nan])

    predictions = linear_classifier.predict(
        input_fn=lambda: my_input_fn(match_features,
                                     placeholder_labels,
                                     num_epochs=1,
                                     shuffle=False))
    # Get just the probabilities for the positive class.
    positive_probabilities = np.array(
        [item['probabilities'][1] for item in predictions])
    return positive_probabilities[0]


opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']

world_cup['points'] = 0
# Float column from the start: probabilities are accumulated into it.
world_cup['total_prob'] = 0.0

# --- Group stage: every pair of teams within a group plays once ---
# Iterate groups in sorted order so the output is deterministic.
for group in sorted(world_cup['Group'].dropna().unique()):
    print('___Starting group {}:___'.format(group))
    group_teams = world_cup.index[world_cup['Group'] == group]
    for home, away in combinations(group_teams, 2):
        print("{} vs. {}: ".format(home, away))
        home_win_prob = _predict_home_win_prob(home, away)

        world_cup.loc[home, 'total_prob'] += home_win_prob
        world_cup.loc[away, 'total_prob'] += 1 - home_win_prob

        if home_win_prob <= 0.5 - margin:
            # Clear advantage for the away side: 3 points.
            print("{} wins with {:.2f}".format(away, 1 - home_win_prob))
            world_cup.loc[away, 'points'] += 3
        elif home_win_prob >= 0.5 + margin:
            # Clear advantage for the home side: 3 points.
            world_cup.loc[home, 'points'] += 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        else:
            # Too close to call: 1 point each.
            print("Draw")
            world_cup.loc[home, 'points'] += 1
            world_cup.loc[away, 'points'] += 1

# --- Knockout stage ---
# Positions into the table of qualified teams (two consecutive rows per group,
# winner first); consecutive entries of this list form one round-of-16 match.
pairing = [0, 3, 4, 7, 8, 11, 12, 15, 1, 2, 5, 6, 9, 10, 13, 14]

# Order by group, then best team first within each group.
world_cup = world_cup.sort_values(by=['Group', 'points', 'total_prob'],
                                  ascending=[True, False, False]).reset_index()
# Select the top 2 of every group. head(2) keeps the sorted row order and all
# columns regardless of pandas version, unlike groupby().nth(), whose result
# shape changed in pandas 2.0.
next_round_wc = world_cup.groupby('Group').head(2).reset_index(drop=True)
next_round_wc = next_round_wc.loc[pairing]
next_round_wc = next_round_wc.set_index('Team')

finals = ['round_of_16', 'quarterfinal', 'semifinal', 'final']

# Match labels and win probabilities of every knockout match; they are only
# collected here and not consumed further in this script.
labels = list()
odds = list()

for f in finals:
    print("___Starting of the {}___".format(f))
    teams = list(next_round_wc.index)
    winners = []

    # Consecutive teams play each other: (0, 1), (2, 3), ...
    for home, away in zip(teams[0::2], teams[1::2]):
        print("{} vs. {}: ".format(home, away))
        home_win_prob = _predict_home_win_prob(home, away)

        # No draws in the knockout stage: the more likely side advances.
        if home_win_prob <= 0.5:
            print("{0} wins with probability {1:.2f}".format(away, 1 - home_win_prob))
            winners.append(away)
        else:
            print("{0} wins with probability {1:.2f}".format(home, home_win_prob))
            winners.append(home)

        # Label each side with the reciprocal of its win probability.
        labels.append("{}({:.2f}) vs. {}({:.2f})".format(
            world_cup_rankings.loc[home, 'country_abrv'],
            1 / home_win_prob,
            world_cup_rankings.loc[away, 'country_abrv'],
            1 / (1 - home_win_prob)))
        odds.append([home_win_prob, 1 - home_win_prob])

    next_round_wc = next_round_wc.loc[winners]
    print("\n")
Model training finished. AUC on the validation set: 0.74 Accuracy on the validation set: 0.67 ___Starting group A:___ Russia vs. Saudi Arabia: Draw Russia vs. Egypt: Egypt wins with 0.67 Russia vs. Uruguay: Uruguay wins with 0.84 Saudi Arabia vs. Egypt: Egypt wins with 0.66 Saudi Arabia vs. Uruguay: Uruguay wins with 0.84 Egypt vs. Uruguay: Uruguay wins with 0.84 ___Starting group C:___ France vs. Australia: France wins with 0.57 France vs. Peru: Draw France vs. Denmark: Draw Australia vs. Peru: Peru wins with 0.84 Australia vs. Denmark: Denmark wins with 0.84 Peru vs. Denmark: Draw ___Starting group B:___ Portugal vs. Spain: Draw Portugal vs. Morocco: Portugal wins with 0.62 Portugal vs. Iran: Portugal wins with 0.62 Spain vs. Morocco: Spain wins with 0.60 Spain vs. Iran: Spain wins with 0.60 Morocco vs. Iran: Draw ___Starting group E:___ Brazil vs. Switzerland: Draw Brazil vs. Costa Rica: Draw Brazil vs. Serbia: Brazil wins with 0.59 Switzerland vs. Costa Rica: Draw Switzerland vs. Serbia: Switzerland wins with 0.57 Costa Rica vs. Serbia: Draw ___Starting group D:___ Argentina vs. Iceland: Draw Argentina vs. Croatia: Draw Argentina vs. Nigeria: Argentina wins with 0.64 Iceland vs. Croatia: Draw Iceland vs. Nigeria: Iceland wins with 0.60 Croatia vs. Nigeria: Croatia wins with 0.60 ___Starting group G:___ Belgium vs. Panama: Belgium wins with 0.68 Belgium vs. Tunisia: Draw Belgium vs. England: Draw Panama vs. Tunisia: Tunisia wins with 0.84 Panama vs. England: England wins with 0.84 Tunisia vs. England: England wins with 0.61 ___Starting group F:___ Germany vs. Mexico: Germany wins with 0.56 Germany vs. Sweden: Germany wins with 0.59 Germany vs. Korea Republic: Germany wins with 0.73 Mexico vs. Sweden: Draw Mexico vs. Korea Republic: Mexico wins with 0.65 Sweden vs. Korea Republic: Sweden wins with 0.64 ___Starting group H:___ Poland vs. Senegal: Draw Poland vs. Colombia: Draw Poland vs. Japan: Poland wins with 0.66 Senegal vs. 
Colombia: Colombia wins with 0.55 Senegal vs. Japan: Senegal wins with 0.63 Colombia vs. Japan: Colombia wins with 0.65 ___Starting of the round_of_16___ Uruguay vs. Spain: Spain wins with probability 0.54 Denmark vs. Croatia: Denmark wins with probability 0.55 Switzerland vs. Mexico: Mexico wins with probability 0.51 England vs. Poland: Poland wins with probability 0.53 Egypt vs. Portugal: Portugal wins with probability 0.84 Peru vs. Argentina: Argentina wins with probability 0.56 Brazil vs. Germany: Germany wins with probability 0.84 Belgium vs. Colombia: Belgium wins with probability 0.54 ___Starting of the quarterfinal___ Spain vs. Denmark: Denmark wins with probability 0.52 Mexico vs. Poland: Poland wins with probability 0.59 Portugal vs. Argentina: Argentina wins with probability 0.53 Germany vs. Belgium: Belgium wins with probability 0.52 ___Starting of the semifinal___ Denmark vs. Poland: Poland wins with probability 0.51 Argentina vs. Belgium: Belgium wins with probability 0.57 ___Starting of the final___ Poland vs. Belgium: Belgium wins with probability 0.84