
Google Machine Learning: Feature Sets

Learning objective: create a feature set with as few features as possible that performs as well as a more complex feature set.

A model with fewer features uses fewer resources and is easier to maintain. Let's see whether we can build a model that uses only a handful of the housing features yet performs as well as a model that uses every feature in the dataset.

from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

california_housing_dataframe = pd.read_csv(
    "https://download.mlcc.google.cn/mledu-datasets/california_housing_train.csv", sep=",")
# Randomize the row order so that the head/tail split below yields unbiased
# training and validation sets.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index)
)
def preprocess_features(california_housing_dataframe):
    """Selects the input features and adds a synthetic rooms-per-person feature."""
    selected_features = california_housing_dataframe[[
        "latitude",
        "longitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income"
    ]]
    processed_features = selected_features.copy()
    # Synthetic feature: average number of rooms per person.
    processed_features["room_per_person"] = (
        california_housing_dataframe["total_rooms"] /
        california_housing_dataframe["population"]
    )
    return processed_features

def preprocess_targets(california_housing_dataframe):
    """Prepares the target: median house value, scaled to units of $1,000."""
    output_targets = pd.DataFrame()
    output_targets["median_house_value"] = (
        california_housing_dataframe["median_house_value"] / 1000.0
    )
    return output_targets
# Use the first 12,000 (of 17,000) examples for training and the last 5,000 for validation.
training_examples = preprocess_features(california_housing_dataframe.head(12000))
training_targets = preprocess_targets(california_housing_dataframe.head(12000))
validation_examples = preprocess_features(california_housing_dataframe.tail(5000))
validation_targets = preprocess_targets(california_housing_dataframe.tail(5000))
print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())


# The correlation matrix shows pairwise correlations: each feature against the
# target, and each feature against every other feature.
# Here, correlation is the Pearson correlation coefficient:
# -1 = perfect negative correlation, 0 = no correlation, 1 = perfect positive correlation.
correlation_dataframe=training_examples.copy()
correlation_dataframe["target"]=training_targets["median_house_value"]
correlation_dataframe.corr()
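
# As a quick way to read the matrix, a minimal sketch (not part of the original
# exercise) that ranks the features by the absolute value of their Pearson
# correlation with the target, making strong candidates easy to spot.
correlation_dataframe.corr()["target"].abs().sort_values(ascending=False)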


def construct_feature_columns(input_features):
    """Constructs one tf numeric feature column per input feature."""
    return set([tf.feature_column.numeric_column(my_feature)
                for my_feature in input_features])

def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Feeds the LinearRegressor one batch of data at a time via the tf.data API."""
    # Convert the pandas data into a dict of numpy arrays.
    features = {key: np.array(value) for key, value in dict(features).items()}
    # from_tensor_slices slices every tensor along its first dimension,
    # producing one dataset element per example.
    ds = Dataset.from_tensor_slices((features, targets))
    ds = ds.batch(batch_size).repeat(num_epochs)
    if shuffle:
        ds = ds.shuffle(10000)
    # Return the next batch of data.
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
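
# A minimal sketch with toy data (the arrays below are assumptions for
# illustration, not from the dataset) of how Dataset.from_tensor_slices slices
# along the first dimension: a dict of two 3-element arrays becomes a dataset
# of 3 examples, each a dict of scalars such as {"a": 1, "b": 4}.
toy_ds = Dataset.from_tensor_slices(
    {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6])})
print(toy_ds.output_shapes)  # {'a': TensorShape([]), 'b': TensorShape([])}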



def train_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  periods = 10
  steps_per_period = steps / periods

  # Create a linear regressor object.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_regressor = tf.estimator.LinearRegressor(
      feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer
  )
    
  # Create input functions.
  training_input_fn = lambda: my_input_fn(training_examples, 
                                          training_targets["median_house_value"], 
                                          batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(training_examples, 
                                                  training_targets["median_house_value"], 
                                                  num_epochs=1, 
                                                  shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(validation_examples, 
                                                    validation_targets["median_house_value"], 
                                                    num_epochs=1, 
                                                    shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print("Training model...")
  print("RMSE (on training data):")
  training_rmse = []
  validation_rmse = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_regressor.train(
        input_fn=training_input_fn,
        steps=steps_per_period,
    )
    # Take a break and compute predictions.
    training_predictions = linear_regressor.predict(input_fn=predict_training_input_fn)
    training_predictions = np.array([item['predictions'][0] for item in training_predictions])
    
    validation_predictions = linear_regressor.predict(input_fn=predict_validation_input_fn)
    validation_predictions = np.array([item['predictions'][0] for item in validation_predictions])
    
    # Compute training and validation loss.
    training_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(training_predictions, training_targets))
    validation_root_mean_squared_error = math.sqrt(
        metrics.mean_squared_error(validation_predictions, validation_targets))
    # Occasionally print the current loss.
    print("  period %02d : %0.2f" % (period, training_root_mean_squared_error))
    # Add the loss metrics from this period to our list.
    training_rmse.append(training_root_mean_squared_error)
    validation_rmse.append(validation_root_mean_squared_error)
  print("Model training finished.")

  
  # Output a graph of loss metrics over periods.
  plt.ylabel("RMSE")
  plt.xlabel("Periods")
  plt.title("Root Mean Squared Error vs. Periods")
  plt.tight_layout()
  plt.plot(training_rmse, label="training")
  plt.plot(validation_rmse, label="validation")
  plt.legend()

  return linear_regressor


# Take 5 minutes to search for a good set of features and training parameters.
# Then check the solution to see which parameters we chose.
# Keep in mind that different features may require different learning parameters.
minimal_features = [
  "median_income",
  "latitude",
]

minimal_training_examples = training_examples[minimal_features]
minimal_validation_examples = validation_examples[minimal_features]

_ = train_model(
    learning_rate=0.01,
    steps=500,
    batch_size=5,
    training_examples=minimal_training_examples,
    training_targets=training_targets,
    validation_examples=minimal_validation_examples,
    validation_targets=validation_targets)
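
# For comparison, a sketch (reusing the hyperparameters above, which is an
# untuned assumption) that trains a baseline on all nine preprocessed features,
# so the two-feature model has something to be measured against.
_ = train_model(
    learning_rate=0.01,
    steps=500,
    batch_size=5,
    training_examples=training_examples,
    training_targets=training_targets,
    validation_examples=validation_examples,
    validation_targets=validation_targets)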


plt.scatter(training_examples["latitude"], training_examples["median_income"])
# There is indeed no linear relationship between the two.
# There are a few peaks, roughly corresponding to Los Angeles and San Francisco.
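
# To make those peaks easier to see, one can mark the approximate latitudes of
# the two cities on the plot (34.05 and 37.77 are rough values added here for
# illustration, not from the exercise).
plt.axvline(x=34.05, color="r", linestyle="--", label="Los Angeles (~34.05)")
plt.axvline(x=37.77, color="g", linestyle="--", label="San Francisco (~37.77)")
plt.legend()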

# zip() takes iterables as arguments and pairs up their corresponding elements
# into tuples; in Python 3 it returns an iterator over those tuples (in Python 2
# it returned a list), e.g. list(zip([1, 2], [3, 4])) == [(1, 3), (2, 4)].
# Try creating some synthetic features that make better use of latitude.
# For example, you could create a feature that maps latitude to the value
# |latitude - 38| and name it distance_from_san_francisco (see the sketch after
# the bucketing run below).
# Alternatively, you could split the space into 10 buckets (e.g. latitude_32_to_33,
# latitude_33_to_34, etc.): the value is 1.0 if latitude falls within the bucket's
# range and 0.0 otherwise.
# Use the correlation matrix to guide the construction of synthetic features;
# if you find one that works well, add it to your model.
# What is the best validation performance you can get?
# Bucketize latitude.
# In Python 3, zip() returns a one-shot iterator, so materialize the bucket
# boundaries as a list to allow iterating over them more than once; this lets a
# single function transform both the training and the validation examples.
LATITUDE_RANGES = list(zip(range(32, 44), range(33, 45)))

def select_and_transform_features(source_df):
  selected_examples = pd.DataFrame()
  selected_examples["median_income"] = source_df["median_income"]
  # One one-hot column per latitude bucket: 1.0 if the example's latitude
  # falls in [lower, upper), else 0.0.
  for r in LATITUDE_RANGES:
    selected_examples["latitude_%d_to_%d" % r] = source_df["latitude"].apply(
      lambda l: 1.0 if l >= r[0] and l < r[1] else 0.0)
  return selected_examples

selected_training_examples = select_and_transform_features(training_examples)
selected_validation_examples = select_and_transform_features(validation_examples)
_ = train_model(
    learning_rate=0.01,
    steps=500,
    batch_size=5,
    training_examples=selected_training_examples,
    training_targets=training_targets,
    validation_examples=selected_validation_examples,
    validation_targets=validation_targets)
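
# As an alternative to bucketing, a sketch of the |latitude - 38| feature
# suggested above; the name distance_from_san_francisco comes from the exercise
# prompt, the helper name is illustrative, and the hyperparameters simply reuse
# those of the bucketing run.
def select_and_transform_features_distance(source_df):
  selected_examples = pd.DataFrame()
  selected_examples["median_income"] = source_df["median_income"]
  # Absolute distance, in degrees of latitude, from roughly San Francisco.
  selected_examples["distance_from_san_francisco"] = (
      source_df["latitude"] - 38).abs()
  return selected_examples

_ = train_model(
    learning_rate=0.01,
    steps=500,
    batch_size=5,
    training_examples=select_and_transform_features_distance(training_examples),
    training_targets=training_targets,
    validation_examples=select_and_transform_features_distance(validation_examples),
    validation_targets=validation_targets)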