Pitch Classifier

Inspired by Mike Fast's paper, "What the Heck is PITCHf/x?". Pitch data from Brooks Baseball.

In [1]:
COLORS = {
    'FF': 'red',
    'SL': 'green',
    'FS': 'blue',
    'CH': 'orange',
    'CU': 'grey',
    'SI': 'purple',
    'FT': 'brown',
}
In [2]:
PITCH_TYPE_LABELS = {
    'FA': 'fastball',
    'FF': 'four-seam fastball',
    'FT': 'two-seam fastball',
    'FC': 'cutter',
    'SI': 'sinker',
    'FS': 'split-fingered fastball',
    'SL': 'slider',
    'CH': 'changeup',
    'CB': 'curveball',
    'CU': 'curveball',
    'KC': 'knuckle-curve',
    'KN': 'knuckleball',
    'EP': 'eephus',
    'UN': 'unidentified',
    'XX': 'unidentified',
    'PO': 'pitch out',
    'FO': 'pitch out',
}

Load the Data

In [3]:
import pandas as pd
In [4]:
pitches = pd.read_csv('data/2019-06-17_547888_Tanaka.csv')
pitches.head()
Out[4]:
   start_speed      pfx_x     pfx_z  mlbam_pitch_name
0        91.87  -4.517746  9.332202                FF
1        83.05   1.624604  1.942790                SL
2        91.21  -7.260078  7.255023                FF
3        81.80   4.409871  0.787128                SL
4        80.63   5.149924  2.596144                SL
In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [6]:
pitch_types = pitches['mlbam_pitch_name'].unique()
pitch_types
Out[6]:
array(['FF', 'SL', 'FS', 'CH', 'CU', 'SI', 'FT'], dtype=object)
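
As an optional check of the class balance, the pitch mix for this start can be tallied with pandas' value_counts, reusing the PITCH_TYPE_LABELS dict defined above:

# Count pitches per type and attach the human-readable labels.
pitches['mlbam_pitch_name'].value_counts().rename(index=PITCH_TYPE_LABELS)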
In [7]:
colors = [ COLORS.get(pitch_type, 'black') for pitch_type in pitches['mlbam_pitch_name']]
In [8]:
plt.scatter(pitches['pfx_x'], pitches['start_speed'], c=colors)
plt.title('Pitch horizontal break vs. pitch speed')
plt.xlabel('horizontal break (inches)')
plt.ylabel('pitch speed (mph)')
Out[8]:
Text(0, 0.5, 'pitch speed (mph)')
In [9]:
plt.scatter(pitches['pfx_x'], pitches['pfx_z'], c=colors)
plt.title('Pitch horizontal break vs. pitch vertical break')
plt.xlabel('horizontal break (inches)')
plt.ylabel('vertical break (inches)')
Out[9]:
Text(0, 0.5, 'vertical break (inches)')
In [10]:
data_x = pitches['pfx_x']
data_y = pitches['start_speed']
data_z = pitches['pfx_z']
In [11]:
from mpl_toolkits.mplot3d import Axes3D
In [12]:
fig = plt.figure()
ax = fig.add_subplot(111,projection='3d')
ax.scatter(data_x,data_y,data_z,c=colors)
Out[12]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x11f9def98>
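
The 3D scatter is drawn without axis labels; assuming the fig and ax objects from the cell above are still in scope, a follow-up like this would label the axes to match the 2D plots:

# Label the 3D axes (x = horizontal break, y = pitch speed, z = vertical break).
ax.set_xlabel('horizontal break (inches)')
ax.set_ylabel('pitch speed (mph)')
ax.set_zlabel('vertical break (inches)')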

Normalize the Data

In [13]:
cols_to_norm = ['start_speed', 'pfx_x', 'pfx_z']
In [14]:
# Min-max scale each feature to [0, 1], and integer-encode the pitch type
# as its index into pitch_types (this becomes the class label).
pitches[cols_to_norm] = pitches[cols_to_norm].apply(lambda x: (x-x.min()) / (x.max() - x.min()))
pitches['pitch_type'] = pitches['mlbam_pitch_name'].apply(lambda type_code: int(np.where(pitch_types == type_code)[0]))
In [15]:
pitches.head()
Out[15]:
   start_speed     pfx_x     pfx_z  mlbam_pitch_name  pitch_type
0     0.926975  0.339739  0.970198                FF           0
1     0.488823  0.660851  0.508245                SL           1
2     0.894188  0.196375  0.840342                FF           0
3     0.426726  0.806460  0.435998                SL           1
4     0.368604  0.845149  0.549089                SL           1
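
Min-max scaling maps each feature onto [0, 1] via (x - x.min()) / (x.max() - x.min()). An optional sanity check on the normalized columns:

# After min-max scaling, every feature column should span exactly [0, 1].
assert (pitches[cols_to_norm].min() == 0).all()
assert (pitches[cols_to_norm].max() == 1).all()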

Define Feature Columns

In [16]:
import tensorflow as tf
In [17]:
start_speed = tf.feature_column.numeric_column('start_speed')
pfx_x = tf.feature_column.numeric_column('pfx_x')
pfx_z = tf.feature_column.numeric_column('pfx_z')
In [18]:
feat_cols = [start_speed, pfx_x, pfx_z]

Train Test Split

In [19]:
pitches.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 5 columns):
start_speed         95 non-null float64
pfx_x               95 non-null float64
pfx_z               95 non-null float64
mlbam_pitch_name    95 non-null object
pitch_type          95 non-null int64
dtypes: float64(3), int64(1), object(1)
memory usage: 3.8+ KB
In [20]:
x_data = pitches.drop(['mlbam_pitch_name','pitch_type'], axis=1)
x_data.head()
Out[20]:
   start_speed     pfx_x     pfx_z
0     0.926975  0.339739  0.970198
1     0.488823  0.660851  0.508245
2     0.894188  0.196375  0.840342
3     0.426726  0.806460  0.435998
4     0.368604  0.845149  0.549089
In [21]:
labels = pitches['pitch_type']
In [22]:
from sklearn.model_selection import train_test_split
In [23]:
X_train, X_test, y_train, y_test = train_test_split(x_data, labels, test_size=0.33, random_state=101)
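
With 95 pitches and test_size=0.33, roughly a third of the rows are held out for evaluation; an optional shape check:

# Confirm the sizes of the training and test splits.
X_train.shape, X_test.shape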

Input Function

In [24]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=10, num_epochs=1000, shuffle=True)
In [25]:
linear_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=len(pitch_types))
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_model_dir': '/var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz', '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_tf_random_seed': 1, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_save_summary_steps': 100}
In [26]:
linear_model.train(input_fn=input_func, steps=1000)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt.
INFO:tensorflow:step = 1, loss = 19.4591
INFO:tensorflow:global_step/sec: 608.276
INFO:tensorflow:step = 101, loss = 10.030265 (0.165 sec)
INFO:tensorflow:global_step/sec: 618.628
INFO:tensorflow:step = 201, loss = 8.143104 (0.162 sec)
INFO:tensorflow:global_step/sec: 573.046
INFO:tensorflow:step = 301, loss = 9.42222 (0.174 sec)
INFO:tensorflow:global_step/sec: 596.99
INFO:tensorflow:step = 401, loss = 9.90423 (0.167 sec)
INFO:tensorflow:global_step/sec: 654.746
INFO:tensorflow:step = 501, loss = 6.442497 (0.153 sec)
INFO:tensorflow:global_step/sec: 587.109
INFO:tensorflow:step = 601, loss = 10.633189 (0.173 sec)
INFO:tensorflow:global_step/sec: 575.238
INFO:tensorflow:step = 701, loss = 11.367533 (0.172 sec)
INFO:tensorflow:global_step/sec: 624.504
INFO:tensorflow:step = 801, loss = 7.041391 (0.160 sec)
INFO:tensorflow:global_step/sec: 594.778
INFO:tensorflow:step = 901, loss = 2.4444485 (0.168 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt.
INFO:tensorflow:Loss for final step: 6.4268947.
Out[26]:
<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1a29eefcc0>
In [27]:
dnn_model = tf.estimator.DNNClassifier(feature_columns=feat_cols, hidden_units=[10,10], n_classes=len(pitch_types))
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_model_dir': '/var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb', '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_tf_random_seed': 1, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_save_summary_steps': 100}
In [28]:
dnn_model.train(input_fn=input_func, steps=1000)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt.
INFO:tensorflow:step = 1, loss = 19.246891
INFO:tensorflow:global_step/sec: 587.699
INFO:tensorflow:step = 101, loss = 6.7308846 (0.171 sec)
INFO:tensorflow:global_step/sec: 585.563
INFO:tensorflow:step = 201, loss = 4.717022 (0.171 sec)
INFO:tensorflow:global_step/sec: 594.417
INFO:tensorflow:step = 301, loss = 5.4684024 (0.168 sec)
INFO:tensorflow:global_step/sec: 597.153
INFO:tensorflow:step = 401, loss = 7.000417 (0.168 sec)
INFO:tensorflow:global_step/sec: 627.062
INFO:tensorflow:step = 501, loss = 3.6366892 (0.159 sec)
INFO:tensorflow:global_step/sec: 604.28
INFO:tensorflow:step = 601, loss = 5.8083935 (0.166 sec)
INFO:tensorflow:global_step/sec: 613.456
INFO:tensorflow:step = 701, loss = 3.27341 (0.166 sec)
INFO:tensorflow:global_step/sec: 617.879
INFO:tensorflow:step = 801, loss = 4.890299 (0.159 sec)
INFO:tensorflow:global_step/sec: 547.091
INFO:tensorflow:step = 901, loss = 2.546061 (0.183 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt.
INFO:tensorflow:Loss for final step: 4.399426.
Out[28]:
<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1a2a4a4d68>

Evaluation

In [29]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)
In [30]:
linear_results = linear_model.evaluate(eval_input_func)
INFO:tensorflow:Starting evaluation at 2019-07-19-11:23:05
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2019-07-19-11:23:06
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.71875, average_loss = 0.9591996, global_step = 1000, loss = 7.673597
In [31]:
linear_results
Out[31]:
{'accuracy': 0.71875,
 'average_loss': 0.9591996,
 'global_step': 1000,
 'loss': 7.673597}
In [32]:
dnn_results = dnn_model.evaluate(eval_input_func)
INFO:tensorflow:Starting evaluation at 2019-07-19-11:23:07
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2019-07-19-11:23:07
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.84375, average_loss = 0.7265939, global_step = 1000, loss = 5.8127513
In [33]:
dnn_results
Out[33]:
{'accuracy': 0.84375,
 'average_loss': 0.7265939,
 'global_step': 1000,
 'loss': 5.8127513}
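
To compare the two models at a glance, the evaluation dicts can be lined up in a single frame (a small convenience using only the results objects above):

# Put the linear and DNN evaluation metrics side by side.
pd.DataFrame({'linear': linear_results, 'dnn': dnn_results})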

Predictions

In [34]:
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)
In [35]:
linear_predictions = list(linear_model.predict(pred_input_func))
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt-1000
In [36]:
# Round each class probability, then take the index of the most likely class.
linear_probabilities = [ [ round(i, 2) for i in pred['probabilities'] ] for pred in linear_predictions ]
linear_predicted_types = [ i.index(max(i)) for i in linear_probabilities ]
In [37]:
dnn_predictions = list(dnn_model.predict(pred_input_func))
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt-1000
In [38]:
# Same as above, for the DNN model's predictions.
dnn_probabilities = [ [ round(i, 2) for i in pred['probabilities'] ] for pred in dnn_predictions ]
dnn_predicted_types = [ i.index(max(i)) for i in dnn_probabilities ]
In [39]:
# Map each predicted class index back to its pitch code, then to a readable label.
linear_predicted_pitches = [ PITCH_TYPE_LABELS[pitch_types[i]] for i in linear_predicted_types ]
In [40]:
dnn_predicted_pitches = [ PITCH_TYPE_LABELS[pitch_types[i]] for i in dnn_predicted_types ]
In [41]:
# Human-readable label for each test pitch's actual type (from mlbam_pitch_name).
actual_pitches = [ PITCH_TYPE_LABELS[labeled_type] for labeled_type in list(pitches.iloc[X_test.index]['mlbam_pitch_name']) ]
In [42]:
comparison = pd.DataFrame({ 'actual': actual_pitches, 'linear': linear_predicted_pitches, 'dnn': dnn_predicted_pitches })
comparison
Out[42]:
    actual                   dnn                      linear
 0  four-seam fastball       four-seam fastball       four-seam fastball
 1  slider                   slider                   slider
 2  sinker                   sinker                   split-fingered fastball
 3  split-fingered fastball  split-fingered fastball  split-fingered fastball
 4  four-seam fastball       four-seam fastball       four-seam fastball
 5  four-seam fastball       four-seam fastball       four-seam fastball
 6  four-seam fastball       four-seam fastball       four-seam fastball
 7  changeup                 split-fingered fastball  four-seam fastball
 8  slider                   slider                   slider
 9  slider                   split-fingered fastball  four-seam fastball
10  split-fingered fastball  split-fingered fastball  split-fingered fastball
11  changeup                 split-fingered fastball  split-fingered fastball
12  slider                   slider                   slider
13  slider                   slider                   slider
14  slider                   slider                   slider
15  slider                   slider                   slider
16  curveball                curveball                slider
17  slider                   slider                   slider
18  curveball                curveball                slider
19  four-seam fastball       four-seam fastball       four-seam fastball
20  four-seam fastball       four-seam fastball       four-seam fastball
21  changeup                 split-fingered fastball  split-fingered fastball
22  slider                   slider                   slider
23  slider                   slider                   slider
24  curveball                slider                   slider
25  slider                   slider                   slider
26  curveball                curveball                slider
27  split-fingered fastball  split-fingered fastball  split-fingered fastball
28  slider                   slider                   slider
29  slider                   slider                   slider
30  four-seam fastball       four-seam fastball       four-seam fastball
31  slider                   slider                   slider
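
A rough way to summarize the table is to measure how often each model's label agrees with the actual pitch, using only the comparison DataFrame just built:

# Fraction of test pitches where each model matches the actual label.
comparison[['linear', 'dnn']].apply(lambda col: (col == comparison['actual']).mean())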