Inspired by Mike Fast's paper, "What the Heck is PITCHf/x?". Data from Brooks Baseball.
# Scatter-plot colour for each pitch-type code. Codes that are missing here
# are drawn in 'black' by the lookup that builds the per-pitch colour list.
COLORS = {
    'FF': 'red',     # four-seam fastball
    'SL': 'green',   # slider
    'FS': 'blue',    # split-fingered fastball
    'CH': 'orange',  # changeup
    'CU': 'grey',    # curveball
    'SI': 'purple',  # sinker
    'FT': 'brown',   # two-seam fastball
}
# Human-readable name for each pitch-type code found in the
# 'mlbam_pitch_name' column. Several codes intentionally share one label.
PITCH_TYPE_LABELS = {
    'FA': 'fastball',
    'FF': 'four-seam fastball',
    'FT': 'two-seam fastball',
    'FC': 'cutter',
    # 'FS' is listed exactly once: a repeated key in a dict literal silently
    # keeps only the last value, which hides whichever entry came first.
    'FS': 'split-fingered fastball',
    'SI': 'sinker',
    'SL': 'slider',
    'CH': 'changeup',
    'CB': 'curveball',
    'CU': 'curveball',
    'KC': 'knuckle-curve',
    'KN': 'knuckleball',
    'EP': 'eephus',
    'UN': 'unidentified',
    'XX': 'unidentified',
    'PO': 'pitch out',
    'FO': 'pitch out',
}
import pandas as pd
# Read the pitch data CSV into a DataFrame.
PITCHES_CSV = 'data/2019-06-17_547888_Tanaka.csv'
pitches = pd.read_csv(PITCHES_CSV)
# Notebook-style preview of the first rows; the value is discarded when this
# runs as a plain script.
pitches.head()
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Distinct pitch-type codes present in the data. A code's position in this
# array is what the label-encoding step later uses as its integer class id.
pitch_types = pitches['mlbam_pitch_name'].unique()
pitch_types

# One colour per pitch, in row order; codes without a COLORS entry are black.
colors = [COLORS.get(pitch_type, 'black') for pitch_type in pitches['mlbam_pitch_name']]

# Each chart gets its own figure. Consecutive pyplot calls otherwise target
# the same axes, so the second scatter would be drawn on top of the first and
# its title and axis labels would replace the first chart's.
plt.figure()
plt.scatter(pitches['pfx_x'], pitches['start_speed'], c=colors)
plt.title('Pitch horizontal break vs. pitch speed')
plt.xlabel('horizontal break (inches)')
plt.ylabel('pitch speed (mph)')

plt.figure()
plt.scatter(pitches['pfx_x'], pitches['pfx_z'], c=colors)
plt.title('Pitch horizontal break vs. pitch vertical break')
plt.xlabel('horizontal break (inches)')
plt.ylabel('vertical break (inches)')
# 3-D scatter of the same three measurements: horizontal break on x, pitch
# speed on y and vertical break on z, coloured with the per-pitch `colors`.
data_x, data_y, data_z = pitches['pfx_x'], pitches['start_speed'], pitches['pfx_z']

# Axes3D is not referenced by name below; NOTE(review): the import is
# presumably kept so that older Matplotlib releases register the '3d'
# projection - confirm before removing.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(data_x, data_y, data_z, c=colors)
# Min-max scale the three model features to the range [0, 1], column by
# column. This overwrites the raw values stored in `pitches`.
# NOTE: a column holding a single distinct value would turn into NaN (0 / 0).
cols_to_norm = ['start_speed', 'pfx_x', 'pfx_z']
feature_min = pitches[cols_to_norm].min()
feature_span = pitches[cols_to_norm].max() - feature_min
pitches[cols_to_norm] = (pitches[cols_to_norm] - feature_min) / feature_span

# Encode every pitch-type code as its position in `pitch_types`.
# The lookup table is built once: this avoids rescanning `pitch_types` for
# each row and avoids calling int() on a one-element array, which NumPy
# deprecates (only 0-d arrays are meant to convert to Python scalars).
pitch_type_index = {type_code: position for position, type_code in enumerate(pitch_types)}
pitches['pitch_type'] = pitches['mlbam_pitch_name'].apply(lambda type_code: pitch_type_index[type_code])
pitches.head()
import tensorflow as tf
# Declare the three measurements as numeric TensorFlow feature columns; the
# individual column objects stay available under their own names as well.
start_speed, pfx_x, pfx_z = (
    tf.feature_column.numeric_column(column_name)
    for column_name in ('start_speed', 'pfx_x', 'pfx_z')
)
feat_cols = [start_speed, pfx_x, pfx_z]

# Print the DataFrame's column/dtype summary.
pitches.info()

# Model inputs: every column except the raw pitch code and its integer
# encoding. Note that this keeps all remaining columns, not only the three
# named in feat_cols.
x_data = pitches.drop(columns=['mlbam_pitch_name', 'pitch_type'])
x_data.head()

# Model target: the integer-encoded pitch type.
labels = pitches['pitch_type']
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    labels,
    test_size=0.33,
    random_state=101,
)

# Training input: shuffled batches of 10 rows, cycling through the training
# set for up to 1000 epochs. Both models below train from this same function.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

# One output class per distinct pitch-type code seen in the data.
num_pitch_classes = len(pitch_types)

# Linear baseline, trained for 1000 steps.
linear_model = tf.estimator.LinearClassifier(
    feature_columns=feat_cols,
    n_classes=num_pitch_classes,
)
linear_model.train(input_fn=input_func, steps=1000)

# Small fully-connected network (two hidden layers of 10 units each), trained
# for the same 1000 steps.
dnn_model = tf.estimator.DNNClassifier(
    feature_columns=feat_cols,
    hidden_units=[10, 10],
    n_classes=num_pitch_classes,
)
dnn_model.train(input_fn=input_func, steps=1000)
# Evaluation input: a single, unshuffled pass over the held-out rows together
# with their labels, in batches of 10.
eval_input_options = dict(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)
eval_input_func = tf.estimator.inputs.pandas_input_fn(**eval_input_options)

# Evaluate the linear model on the held-out rows (bare name = notebook display).
linear_results = linear_model.evaluate(input_fn=eval_input_func)
linear_results

# Evaluate the DNN model on the same held-out rows.
dnn_results = dnn_model.evaluate(input_fn=eval_input_func)
dnn_results
# Prediction input: a single, unshuffled pass over the held-out features
# (no labels), so predictions come back in the same row order as X_test.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)

# Linear model: per-row class probabilities and the chosen class index.
linear_predictions = list(linear_model.predict(pred_input_func))
# Probabilities rounded to two decimals are kept for display only.
linear_probabilities = [[round(i, 2) for i in pred['probabilities']] for pred in linear_predictions]
# The class is chosen from the RAW probabilities: after rounding, near-equal
# classes can tie, and picking the first maximum would then select the lower
# class index rather than the genuinely most probable one.
linear_predicted_types = [int(np.argmax(pred['probabilities'])) for pred in linear_predictions]

# DNN model: same treatment.
dnn_predictions = list(dnn_model.predict(pred_input_func))
dnn_probabilities = [[round(i, 2) for i in pred['probabilities']] for pred in dnn_predictions]
dnn_predicted_types = [int(np.argmax(pred['probabilities'])) for pred in dnn_predictions]

# Class index -> pitch-type code -> human-readable label.
linear_predicted_pitches = [PITCH_TYPE_LABELS[pitch_types[i]] for i in linear_predicted_types]
dnn_predicted_pitches = [PITCH_TYPE_LABELS[pitch_types[i]] for i in dnn_predicted_types]

# X_test.index holds index LABELS, so the true codes are fetched with the
# label-based .loc; the position-based .iloc only gives the same rows when
# the DataFrame happens to have a default 0..n-1 index.
actual_pitches = [
    PITCH_TYPE_LABELS[labeled_type]
    for labeled_type in pitches.loc[X_test.index, 'mlbam_pitch_name']
]

# Side-by-side table of the true pitch and each model's prediction.
comparison = pd.DataFrame({'actual': actual_pitches, 'linear': linear_predicted_pitches, 'dnn': dnn_predicted_pitches})
comparison