
Google Machine Learning: Validation

Learning objectives:

  • Use multiple features, instead of a single feature, to further improve the effectiveness of a model
  • Debug issues in the model's input data
  • Use a test data set to check whether a model is overfitting the validation data
# Initialization: imports and data loading.
from __future__ import print_function
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

california_housing_dataframe = pd.read_csv("https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv", sep=",")
# Preprocessing helpers: select input features and build a synthetic feature.
def preprocess_features(california_housing_dataframe):
  selected_features = california_housing_dataframe[
    ["latitude",
     "longitude",
     "housing_median_age",
     "total_rooms",
     "total_bedrooms",
     "population",
     "households",
     "median_income"]]
  processed_features = selected_features.copy()
  # Create a synthetic feature.
  processed_features["rooms_per_person"] = (
    california_housing_dataframe["total_rooms"] /
    california_housing_dataframe["population"])
  return processed_features

def preprocess_targets(california_housing_dataframe):
  output_targets = pd.DataFrame()
  # Scale the target to be in units of thousands of dollars.
  output_targets["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0)
  return output_targets
# Use the first 12,000 examples (of 17,000) for training.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_examples.describe()

training_targets = preprocess_targets(california_housing_dataframe.head(12000))
training_targets.describe()

# Use the last 5,000 examples (of 17,000) for validation.
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_examples.describe()

validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))
validation_targets.describe()
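With a properly randomized split, the training and validation sets should have similar feature distributions, so comparing summary statistics side by side is a quick sanity check before plotting. A minimal sketch (the `comparison` DataFrame name is just illustrative):

# Illustrative check: feature means should roughly match across the two splits.
# With the unrandomized head()/tail() split used here, they won't.
comparison = pd.DataFrame({
    "training_mean": training_examples.mean(),
    "validation_mean": validation_examples.mean()})
print(comparison)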

plt.figure(figsize=(13, 8))  # Create the figure.

ax = plt.subplot(1, 2, 1)  # Left subplot: validation data.
ax.set_title("Validation Data")
ax.set_autoscaley_on(False)
ax.set_ylim([32, 43])  # y-axis (latitude) limits.
ax.set_autoscalex_on(False)
ax.set_xlim([-126, -112])  # x-axis (longitude) limits.
plt.scatter(validation_examples["longitude"],
            validation_examples["latitude"],
            cmap=cm.coolwarm,
            c=validation_targets["median_house_value"] / validation_targets["median_house_value"].max())

ax = plt.subplot(1, 2, 2)  # Right subplot: training data.
ax.set_title("Training Data")
ax.set_autoscaley_on(False)
ax.set_ylim([32, 43])
ax.set_autoscalex_on(False)
ax.set_xlim([-126, -112])
plt.scatter(training_examples["longitude"],
            training_examples["latitude"],
            cmap=cm.coolwarm,
            c=training_targets["median_house_value"] / training_targets["median_house_value"].max())
_ = plt.plot()

We can see that the training and validation distributions split into two separate regions, as if the map of California had been cut in two. This is because we did not properly randomize the data before creating the training and validation sets: head() and tail() take contiguous slices, so any ordering in the CSV carries over into the splits.
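One standard fix, used in the original MLCC notebook, is to shuffle the row order before splitting:

# Randomize the row order before taking head()/tail() splits, then rebuild
# the training and validation sets with the code above.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index))

After re-running the preprocessing steps and the scatter plots above, the two maps should cover roughly the same area.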

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    # Convert pandas data into a dict of numpy arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}
    # Construct a dataset, then batch and repeat it.
    ds = Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    # Shuffle the data, if specified. Note the reassignment:
    # Dataset.shuffle() returns a new dataset rather than mutating in place.
    if shuffle:
        ds = ds.shuffle(10000)
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
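To sanity-check the input pipeline, you can pull a single batch and evaluate it in a TF 1.x session. A hedged sketch; the variable names below are illustrative:

# Illustrative: materialize one batch from the input function.
example_features, example_labels = my_input_fn(
    training_examples, training_targets["median_house_value"], batch_size=2)
with tf.Session() as sess:
    print(sess.run(example_labels))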

def construct_feature_columns(input_features):
    # Generate a numeric feature column for each input feature.
    return set([tf.feature_column.numeric_column(my_feature)
                for my_feature in input_features])
# Reusable training routine: trains the regressor and tracks RMSE per period.
def train_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
    periods = 10  # Number of reporting periods.
    steps_per_period = steps / periods  # Training steps per period.

    # Create a linear regressor object with gradient clipping.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    # Configure the linear regression model.
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=construct_feature_columns(training_examples),
        optimizer=my_optimizer)

    # Create input functions.
    training_input_fn = lambda: my_input_fn(
        training_examples,
        training_targets["median_house_value"],
        batch_size=batch_size)
    predict_training_input_fn = lambda: my_input_fn(
        training_examples,
        training_targets["median_house_value"],
        num_epochs=1,
        shuffle=False)
    predict_validation_input_fn = lambda: my_input_fn(
        validation_examples,
        validation_targets["median_house_value"],
        num_epochs=1,
        shuffle=False)
    
    # Train the model inside a loop so that we can periodically assess
    # loss metrics.
    print("Training model...")
    print("RMSE (on training data):")
    training_rmse = []
    validation_rmse = []
    for period in range(0, periods):
        # Train the model, starting from the prior state (pausing each
        # period to report metrics).
        linear_regressor.train(
            input_fn=training_input_fn,
            steps=steps_per_period)
        # Take a break and compute predictions.
        training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
        training_predictions = np.array([item['predictions'][0] for item in training_predictions])
        validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
        validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
        # Compute training and validation loss.
        training_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(training_predictions, training_targets))
        validation_root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(validation_predictions, validation_targets))
        # Print the current training loss.
        print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
        # Add the loss metrics from this period to our lists.
        training_rmse.append(training_root_mean_squared_error)
        validation_rmse.append(validation_root_mean_squared_error)
    print("Model training finished.")
    
    # Output a graph of loss metrics over periods.
    plt.ylabel("RMSE")
    plt.xlabel("Periods")
    plt.title("Root Mean Squared Error vs. Periods")
    plt.tight_layout()
    plt.plot(training_rmse, label="training")
    plt.plot(validation_rmse, label="validation")
    plt.legend()
    return linear_regressor

# Train the model.
linear_regressor = train_model(
    learning_rate=0.00003,
    steps=500,
    batch_size=5,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)

# Evaluate on the test data.
# We have iterated against the validation data many times; a held-out test set
# confirms the model has not overfit the peculiarities of that particular sample.
california_housing_test_data = pd.read_csv("https://download.mlcc.google.cn/mledu-datasets/california_housing_test.csv", sep=",")
test_examples = preprocess_features(california_housing_test_data)
test_targets = preprocess_targets(california_housing_test_data)
predict_test_input_fn = lambda: my_input_fn(
    test_examples, test_targets["median_house_value"],
    num_epochs=1, shuffle=False)
test_predictions = linear_regressor.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item['predictions'][0] for item in test_predictions])
root_mean_squared_error = math.sqrt(metrics.mean_squared_error(test_predictions, test_targets))
print("Final RMSE (on test data): %0.2f" % root_mean_squared_error)