Pitch Classifier

Inspired by Mike Fast's paper, "What the Heck is PITCHf/x?". Pitch data from Brooks Baseball.

In [1]:
COLORS = {
    'FF': 'red',
    'SL': 'green',
    'FS': 'blue',
    'CH': 'orange',
    'CU': 'grey',
    'SI': 'purple',
    'FT': 'brown',
}
In [2]:
PITCH_TYPE_LABELS = {
    'FA': 'fastball',
    'FF': 'four-seam fastball',
    'FT': 'two-seam fastball',
    'FC': 'cutter',
    'SI': 'sinker',
    'FS': 'split-fingered fastball',
    'SL': 'slider',
    'CH': 'changeup',
    'CB': 'curveball',
    'CU': 'curveball',
    'KC': 'knuckle-curve',
    'KN': 'knuckleball',
    'EP': 'eephus',
    'UN': 'unidentified',
    'XX': 'unidentified',
    'PO': 'pitch out',
    'FO': 'pitch out',
}

Load the Data

In [3]:
import pandas as pd
In [4]:
pitches = pd.read_csv('data/2019-06-17_547888_Tanaka.csv')
pitches.head()
Out[4]:
   start_speed      pfx_x     pfx_z  mlbam_pitch_name
0        91.87  -4.517746  9.332202                FF
1        83.05   1.624604  1.942790                SL
2        91.21  -7.260078  7.255023                FF
3        81.80   4.409871  0.787128                SL
4        80.63   5.149924  2.596144                SL
In [5]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [6]:
pitch_types = pitches['mlbam_pitch_name'].unique()
pitch_types
Out[6]:
array(['FF', 'SL', 'FS', 'CH', 'CU', 'SI', 'FT'], dtype=object)
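
As an optional check of the class balance, the pitch mix for this start can be tallied with pandas' value_counts, reusing the PITCH_TYPE_LABELS dict defined above:

# Count pitches per type and attach the human-readable labels.
pitches['mlbam_pitch_name'].value_counts().rename(index=PITCH_TYPE_LABELS)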
In [7]:
colors = [ COLORS.get(pitch_type, 'black') for pitch_type in pitches['mlbam_pitch_name']]
In [8]:
plt.scatter(pitches['pfx_x'], pitches['start_speed'], c=colors)
plt.title('Pitch horizontal break vs. pitch speed')
plt.xlabel('horizontal break (inches)')
plt.ylabel('pitch speed (mph)')
Out[8]:
Text(0, 0.5, 'pitch speed (mph)')
In [9]:
plt.scatter(pitches['pfx_x'], pitches['pfx_z'], c=colors)
plt.title('Pitch horizontal break vs. pitch vertical break')
plt.xlabel('horizontal break (inches)')
plt.ylabel('vertical break (inches)')
Out[9]:
Text(0, 0.5, 'vertical break (inches)')
In [10]:
data_x = pitches['pfx_x']
data_y = pitches['start_speed']
data_z = pitches['pfx_z']
In [11]:
from mpl_toolkits.mplot3d import Axes3D
In [12]:
fig = plt.figure()
ax = fig.add_subplot(111,projection='3d')
ax.scatter(data_x,data_y,data_z,c=colors)
Out[12]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x11f9def98>
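
The 3D scatter is drawn without axis labels; assuming the fig and ax objects from the cell above are still in scope, a follow-up like this would label the axes to match the 2D plots:

# Label the 3D axes (x = horizontal break, y = pitch speed, z = vertical break).
ax.set_xlabel('horizontal break (inches)')
ax.set_ylabel('pitch speed (mph)')
ax.set_zlabel('vertical break (inches)')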

Normalize the Data

In [13]:
cols_to_norm = ['start_speed', 'pfx_x', 'pfx_z']
In [14]:
# Min-max scale each feature to [0, 1], and integer-encode the pitch type
# as its index into pitch_types (this becomes the class label).
pitches[cols_to_norm] = pitches[cols_to_norm].apply(lambda x: (x-x.min()) / (x.max() - x.min()))
pitches['pitch_type'] = pitches['mlbam_pitch_name'].apply(lambda type_code: int(np.where(pitch_types == type_code)[0]))
In [15]:
pitches.head()
Out[15]:
   start_speed     pfx_x     pfx_z  mlbam_pitch_name  pitch_type
0     0.926975  0.339739  0.970198                FF           0
1     0.488823  0.660851  0.508245                SL           1
2     0.894188  0.196375  0.840342                FF           0
3     0.426726  0.806460  0.435998                SL           1
4     0.368604  0.845149  0.549089                SL           1
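
Min-max scaling maps each feature onto [0, 1] via (x - x.min()) / (x.max() - x.min()). An optional sanity check on the normalized columns:

# After min-max scaling, every feature column should span exactly [0, 1].
assert (pitches[cols_to_norm].min() == 0).all()
assert (pitches[cols_to_norm].max() == 1).all()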

Define Feature Columns

In [16]:
import tensorflow as tf
In [17]:
start_speed = tf.feature_column.numeric_column('start_speed')
pfx_x = tf.feature_column.numeric_column('pfx_x')
pfx_z = tf.feature_column.numeric_column('pfx_z')
In [18]:
feat_cols = [start_speed, pfx_x, pfx_z]

Train Test Split

In [19]:
pitches.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 5 columns):
start_speed         95 non-null float64
pfx_x               95 non-null float64
pfx_z               95 non-null float64
mlbam_pitch_name    95 non-null object
pitch_type          95 non-null int64
dtypes: float64(3), int64(1), object(1)
memory usage: 3.8+ KB
In [20]:
x_data = pitches.drop(['mlbam_pitch_name','pitch_type'], axis=1)
x_data.head()
Out[20]:
   start_speed     pfx_x     pfx_z
0     0.926975  0.339739  0.970198
1     0.488823  0.660851  0.508245
2     0.894188  0.196375  0.840342
3     0.426726  0.806460  0.435998
4     0.368604  0.845149  0.549089
In [21]:
labels = pitches['pitch_type']
In [22]:
from sklearn.model_selection import train_test_split
In [23]:
X_train, X_test, y_train, y_test = train_test_split(x_data, labels, test_size=0.33, random_state=101)
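
With 95 pitches and test_size=0.33, roughly a third of the rows are held out for evaluation; an optional shape check:

# Confirm the sizes of the training and test splits.
X_train.shape, X_test.shape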

Input Function

In [24]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=10, num_epochs=1000, shuffle=True)
In [25]:
linear_model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=len(pitch_types))
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_model_dir': '/var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz', '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_tf_random_seed': 1, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_save_summary_steps': 100}
In [26]:
linear_model.train(input_fn=input_func, steps=1000)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt.
INFO:tensorflow:step = 1, loss = 19.4591
INFO:tensorflow:global_step/sec: 608.276
INFO:tensorflow:step = 101, loss = 10.030265 (0.165 sec)
INFO:tensorflow:global_step/sec: 618.628
INFO:tensorflow:step = 201, loss = 8.143104 (0.162 sec)
INFO:tensorflow:global_step/sec: 573.046
INFO:tensorflow:step = 301, loss = 9.42222 (0.174 sec)
INFO:tensorflow:global_step/sec: 596.99
INFO:tensorflow:step = 401, loss = 9.90423 (0.167 sec)
INFO:tensorflow:global_step/sec: 654.746
INFO:tensorflow:step = 501, loss = 6.442497 (0.153 sec)
INFO:tensorflow:global_step/sec: 587.109
INFO:tensorflow:step = 601, loss = 10.633189 (0.173 sec)
INFO:tensorflow:global_step/sec: 575.238
INFO:tensorflow:step = 701, loss = 11.367533 (0.172 sec)
INFO:tensorflow:global_step/sec: 624.504
INFO:tensorflow:step = 801, loss = 7.041391 (0.160 sec)
INFO:tensorflow:global_step/sec: 594.778
INFO:tensorflow:step = 901, loss = 2.4444485 (0.168 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt.
INFO:tensorflow:Loss for final step: 6.4268947.
Out[26]:
<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x1a29eefcc0>
In [27]:
dnn_model = tf.estimator.DNNClassifier(feature_columns=feat_cols, hidden_units=[10,10], n_classes=len(pitch_types))
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb
INFO:tensorflow:Using config: {'_save_checkpoints_steps': None, '_model_dir': '/var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb', '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_tf_random_seed': 1, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_save_summary_steps': 100}
In [28]:
dnn_model.train(input_fn=input_func, steps=1000)
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt.
INFO:tensorflow:step = 1, loss = 19.246891
INFO:tensorflow:global_step/sec: 587.699
INFO:tensorflow:step = 101, loss = 6.7308846 (0.171 sec)
INFO:tensorflow:global_step/sec: 585.563
INFO:tensorflow:step = 201, loss = 4.717022 (0.171 sec)
INFO:tensorflow:global_step/sec: 594.417
INFO:tensorflow:step = 301, loss = 5.4684024 (0.168 sec)
INFO:tensorflow:global_step/sec: 597.153
INFO:tensorflow:step = 401, loss = 7.000417 (0.168 sec)
INFO:tensorflow:global_step/sec: 627.062
INFO:tensorflow:step = 501, loss = 3.6366892 (0.159 sec)
INFO:tensorflow:global_step/sec: 604.28
INFO:tensorflow:step = 601, loss = 5.8083935 (0.166 sec)
INFO:tensorflow:global_step/sec: 613.456
INFO:tensorflow:step = 701, loss = 3.27341 (0.166 sec)
INFO:tensorflow:global_step/sec: 617.879
INFO:tensorflow:step = 801, loss = 4.890299 (0.159 sec)
INFO:tensorflow:global_step/sec: 547.091
INFO:tensorflow:step = 901, loss = 2.546061 (0.183 sec)
INFO:tensorflow:Saving checkpoints for 1000 into /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt.
INFO:tensorflow:Loss for final step: 4.399426.
Out[28]:
<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1a2a4a4d68>

Evaluation

In [29]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)
In [30]:
linear_results = linear_model.evaluate(eval_input_func)
INFO:tensorflow:Starting evaluation at 2019-07-19-11:23:05
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2019-07-19-11:23:06
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.71875, average_loss = 0.9591996, global_step = 1000, loss = 7.673597
In [31]:
linear_results
Out[31]:
{'accuracy': 0.71875,
 'average_loss': 0.9591996,
 'global_step': 1000,
 'loss': 7.673597}
In [32]:
dnn_results = dnn_model.evaluate(eval_input_func)
INFO:tensorflow:Starting evaluation at 2019-07-19-11:23:07
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt-1000
INFO:tensorflow:Finished evaluation at 2019-07-19-11:23:07
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.84375, average_loss = 0.7265939, global_step = 1000, loss = 5.8127513
In [33]:
dnn_results
Out[33]:
{'accuracy': 0.84375,
 'average_loss': 0.7265939,
 'global_step': 1000,
 'loss': 5.8127513}
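
To compare the two models at a glance, the evaluation dicts can be lined up in a single frame (a small convenience using only the results objects above):

# Put the linear and DNN evaluation metrics side by side.
pd.DataFrame({'linear': linear_results, 'dnn': dnn_results})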

Predictions

In [34]:
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False
)
In [35]:
linear_predictions = list(linear_model.predict(pred_input_func))
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpyyv97wkz/model.ckpt-1000
In [36]:
# Round each class probability, then take the index of the most likely class.
linear_probabilities = [ [ round(i, 2) for i in pred['probabilities'] ] for pred in linear_predictions ]
linear_predicted_types = [ i.index(max(i)) for i in linear_probabilities ]
In [37]:
dnn_predictions = list(dnn_model.predict(pred_input_func))
INFO:tensorflow:Restoring parameters from /var/folders/kb/s3s7jjy90yx_6r9ztgl83y_m0000gp/T/tmpxjwyuxsb/model.ckpt-1000
In [38]:
# Same as above, for the DNN model's predictions.
dnn_probabilities = [ [ round(i, 2) for i in pred['probabilities'] ] for pred in dnn_predictions ]
dnn_predicted_types = [ i.index(max(i)) for i in dnn_probabilities ]
In [39]:
# Map each predicted class index back to its pitch code, then to a readable label.
linear_predicted_pitches = [ PITCH_TYPE_LABELS[pitch_types[i]] for i in linear_predicted_types ]
In [40]:
dnn_predicted_pitches = [ PITCH_TYPE_LABELS[pitch_types[i]] for i in dnn_predicted_types ]
In [41]:
# Human-readable label for each test pitch's actual type (from mlbam_pitch_name).
actual_pitches = [ PITCH_TYPE_LABELS[labeled_type] for labeled_type in list(pitches.iloc[X_test.index]['mlbam_pitch_name']) ]
In [42]:
comparison = pd.DataFrame({ 'actual': actual_pitches, 'linear': linear_predicted_pitches, 'dnn': dnn_predicted_pitches })
comparison
Out[42]:
    actual                   dnn                      linear
 0  four-seam fastball       four-seam fastball       four-seam fastball
 1  slider                   slider                   slider
 2  sinker                   sinker                   split-fingered fastball
 3  split-fingered fastball  split-fingered fastball  split-fingered fastball
 4  four-seam fastball       four-seam fastball       four-seam fastball
 5  four-seam fastball       four-seam fastball       four-seam fastball
 6  four-seam fastball       four-seam fastball       four-seam fastball
 7  changeup                 split-fingered fastball  four-seam fastball
 8  slider                   slider                   slider
 9  slider                   split-fingered fastball  four-seam fastball
10  split-fingered fastball  split-fingered fastball  split-fingered fastball
11  changeup                 split-fingered fastball  split-fingered fastball
12  slider                   slider                   slider
13  slider                   slider                   slider
14  slider                   slider                   slider
15  slider                   slider                   slider
16  curveball                curveball                slider
17  slider                   slider                   slider
18  curveball                curveball                slider
19  four-seam fastball       four-seam fastball       four-seam fastball
20  four-seam fastball       four-seam fastball       four-seam fastball
21  changeup                 split-fingered fastball  split-fingered fastball
22  slider                   slider                   slider
23  slider                   slider                   slider
24  curveball                slider                   slider
25  slider                   slider                   slider
26  curveball                curveball                slider
27  split-fingered fastball  split-fingered fastball  split-fingered fastball
28  slider                   slider                   slider
29  slider                   slider                   slider
30  four-seam fastball       four-seam fastball       four-seam fastball
31  slider                   slider                   slider
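
A rough way to summarize the table is to measure how often each model's label agrees with the actual pitch, using only the comparison DataFrame just built:

# Fraction of test pitches where each model matches the actual label.
comparison[['linear', 'dnn']].apply(lambda col: (col == comparison['actual']).mean())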