@@ -13,9 +13,11 @@ def _label_percentage(data_frame):
     :return: label_percentage_dict: dictionary maps label : % of labels
     """
     total_examples = len(data_frame)
-    label_frequency_dict = dict(Counter(data_frame['intent']).most_common())
-    percentage_list = np.array(list(label_frequency_dict.values()))/total_examples
-    label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list))
+    label_frequency_dict = dict(Counter(data_frame["intent"]).most_common())
+    percentage_list = np.array(list(label_frequency_dict.values())) / total_examples
+    label_percentage_dict = dict(
+        zip(list(label_frequency_dict.keys()), percentage_list)
+    )
     return label_percentage_dict
 
 
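For context, the rewrapped _label_percentage body is the usual Counter-to-fractions idiom. A minimal sketch with a hypothetical intent column (toy data, not from this repo):

    from collections import Counter

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"intent": ["greet", "greet", "book", "cancel"]})

    # Counts per label, most common first, then normalized by the row count.
    freq = dict(Counter(df["intent"]).most_common())
    pct = np.array(list(freq.values())) / len(df)
    print(dict(zip(freq.keys(), pct)))  # {'greet': 0.5, 'book': 0.25, 'cancel': 0.25}
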
@@ -26,15 +28,17 @@ def _train_test_coloring(val):
     :return:
     """
     if val > 25:
-        color = 'red'
+        color = "red"
     elif val > 10:
-        color = 'DarkBlue'
+        color = "DarkBlue"
     else:
-        color = 'green'
-    return 'color: %s' % color
+        color = "green"
+    return "color: %s" % color
 
 
-def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict):
+def _train_test_label_difference(
+    workspace_label_percentage_dict, test_label_percentage_dict
+):
     """
     analyze the difference between training set and test set
     :param workspace_label_percentage_dict:
@@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per
         current_difference = np.abs(test_percentage - workspace_percentage)
 
         if key in test_label_percentage_dict:
-            difference_dict[key] = [workspace_percentage * 100,
-                                    test_percentage * 100,
-                                    current_difference * 100]
+            difference_dict[key] = [
+                workspace_percentage * 100,
+                test_percentage * 100,
+                current_difference * 100,
+            ]
 
     js_distance = distance.jensenshannon(distribution1, distribution2, 2.0)
 
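A note on the distance.jensenshannon(distribution1, distribution2, 2.0) call above: SciPy returns the Jensen-Shannon distance (the square root of JS divergence), and base 2.0 bounds it in [0, 1]. A standalone check on two toy label distributions (assumed values, for illustration only):

    import numpy as np
    from scipy.spatial import distance

    # Two label distributions over the same three intents; each sums to 1.
    train_dist = np.array([0.5, 0.3, 0.2])
    test_dist = np.array([0.2, 0.3, 0.5])

    # 0.0 for identical distributions, 1.0 for disjoint ones (base 2).
    print(distance.jensenshannon(train_dist, test_dist, 2.0))
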
@@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd):
     """
     train_vocab = set()
     test_vocab = set()
-    train_set_tokens = train_set_pd['utterance'].apply(word_tokenize)
-    test_set_tokens = test_set_pd['utterance'].apply(word_tokenize)
+    train_set_tokens = train_set_pd["utterance"].apply(word_tokenize)
+    test_set_tokens = test_set_pd["utterance"].apply(word_tokenize)
 
     for tokens in train_set_tokens.tolist():
         train_vocab.update(tokens)
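The tokenize-and-update loop above accumulates a vocabulary set per split. A minimal sketch of the same pattern, assuming NLTK's punkt tokenizer data is installed:

    import pandas as pd
    from nltk import word_tokenize  # needs a one-time nltk.download("punkt")

    df = pd.DataFrame({"utterance": ["book a flight", "book a hotel room"]})

    vocab = set()
    for tokens in df["utterance"].apply(word_tokenize).tolist():
        vocab.update(tokens)  # set union with each utterance's tokens

    print(sorted(vocab))  # ['a', 'book', 'flight', 'hotel', 'room']
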
@@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd):
        train_test_length_comparison: pandas dataframe [Intent, Absolute Difference]
     """
     train_pd_temp = train_set_pd.copy()
-    train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize)
-    train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len)
-    train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean()
+    train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize)
+    train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len)
+    train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean()
 
     test_pd_temp = test_set_pd.copy()
-    test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize)
-    test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len)
-    test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean()
-
-    train_test_length_comparison = pd.merge(train_avg_len_by_label,
-                                            test_avg_len_by_label, on='intent')
-    train_test_length_comparison['Absolute Difference'] = \
-        np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test'])
+    test_pd_temp["tokens"] = test_set_pd["utterance"].apply(word_tokenize)
+    test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len)
+    test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean()
+
+    train_test_length_comparison = pd.merge(
+        train_avg_len_by_label, test_avg_len_by_label, on="intent"
+    )
+    train_test_length_comparison["Absolute Difference"] = np.abs(
+        train_test_length_comparison["Train"] - train_test_length_comparison["Test"]
+    )
     train_test_length_comparison = train_test_length_comparison.sort_values(
-        by=["Absolute Difference"], ascending=False)
+        by=["Absolute Difference"], ascending=False
+    )
     train_test_length_comparison = train_test_length_comparison.reset_index()
-    train_test_length_comparison.rename(columns={'intent':'Intent'
-                                                 }, inplace=True)
+    train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True)
     return train_test_length_comparison
 
 
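The hunk above computes mean token counts per intent for train and test and then merges the two. A compressed illustration of the groupby/mean step on made-up data:

    import pandas as pd
    from nltk import word_tokenize

    train = pd.DataFrame({
        "intent": ["greet", "greet", "book"],
        "utterance": ["hi", "hello there", "book me a table for two"],
    })

    # Average utterance length in tokens, per intent.
    train["Train"] = train["utterance"].apply(word_tokenize).apply(len)
    print(train[["intent", "Train"]].groupby("intent").mean())
    # book -> 6.0, greet -> 1.5
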
@@ -137,8 +145,8 @@ def _get_metrics(results):
        recall_dict: maps the {intent: recall}
        f1_dict: maps the {intent:f1}
     """
-    groundtruth = results['correct_intent'].values.tolist()
-    top_intent = results['top_intent'].values.tolist()
+    groundtruth = results["correct_intent"].values.tolist()
+    top_intent = results["top_intent"].values.tolist()
     gt_cnt_dict = dict()
     pred_cnt_dict = dict()
     true_positive_dict = dict()
@@ -152,13 +160,22 @@ def _get_metrics(results):
     f1_dict = dict()
     for lb in true_positive_dict:
 
-        recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
-
-        precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \
-            else 0
-
-        f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \
-            else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb])
+        recall_dict[lb] = (
+            true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
+        )
+
+        precision_dict[lb] = (
+            true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0
+        )
+
+        f1_dict[lb] = (
+            0.0
+            if recall_dict[lb] == 0 and precision_dict[lb] == 0
+            else 2.0
+            * recall_dict[lb]
+            * precision_dict[lb]
+            / (recall_dict[lb] + precision_dict[lb])
+        )
 
     return precision_dict, recall_dict, f1_dict
 
 
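The per-label arithmetic the reformatted block spells out is ordinary precision/recall/F1. A toy check with hypothetical labels:

    from collections import Counter

    groundtruth = ["greet", "greet", "book", "book"]
    predicted = ["greet", "book", "book", "book"]

    gt_cnt = Counter(groundtruth)
    pred_cnt = Counter(predicted)
    tp = Counter(g for g, p in zip(groundtruth, predicted) if g == p)

    for lb in tp:
        recall = tp[lb] / gt_cnt[lb]        # greet: 0.5, book: 1.0
        precision = tp[lb] / pred_cnt[lb]   # greet: 1.0, book: 0.667
        f1 = 2.0 * precision * recall / (precision + recall)
        print(lb, round(precision, 2), round(recall, 2), round(f1, 2))
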
@@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     workspace_label_percentage_dict = _label_percentage(train_set_pd)
     test_label_percentage_dict = _label_percentage(test_set_pd)
 
-    missing_label, difference_dict, js = \
-        _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict)
+    missing_label, difference_dict, js = _train_test_label_difference(
+        workspace_label_percentage_dict, test_label_percentage_dict
+    )
     train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd)
 
-    train_test_length_comparison_pd = \
-        _train_test_utterance_length_difference(train_set_pd, test_set_pd)
+    train_test_length_comparison_pd = _train_test_utterance_length_difference(
+        train_set_pd, test_set_pd
+    )
 
     display(Markdown("## Test Data Evaluation"))
 
@@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     label = list(difference_dict.keys())
     diff = np.round(list(difference_dict.values()), 2)
     precision_dict, recall_dict, f1_dict = _get_metrics(results)
-    precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0
-                          for l in label], 2)
+    precision = np.round(
+        [precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label],
+        2,
+    )
 
-    recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2)
+    recall = np.round(
+        [recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2
+    )
 
-    f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2)
+    f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2)
 
-    train_count_dict = dict(Counter(train_set_pd['intent']))
-    test_count_dict = dict(Counter(test_set_pd['intent']))
+    train_count_dict = dict(Counter(train_set_pd["intent"]))
+    test_count_dict = dict(Counter(test_set_pd["intent"]))
     tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label]
     te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label]
 
-    difference_pd = pd.DataFrame({"Intent": label,
-                                  "% of Train": diff[:, 0],
-                                  "% of Test": diff[:, 1],
-                                  "Absolute Difference %": diff[:, 2],
-                                  "Train Examples": tr_cnt,
-                                  "Test Examples": te_cnt,
-                                  "Test Precision %": precision,
-                                  "Test Recall %": recall,
-                                  "Test F1 %": f1})
-
-    if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty:
-        table_for_display = difference_pd[difference_pd["Absolute Difference %"]
-                                          > .001].sort_values(by=["Absolute Difference %"],
-                                                              ascending=False)
-        table_for_display = \
-            table_for_display.style.applymap(_train_test_coloring,
-                                             subset=pd.IndexSlice[:, ["Absolute Difference %"]])
+    difference_pd = pd.DataFrame(
+        {
+            "Intent": label,
+            "% of Train": diff[:, 0],
+            "% of Test": diff[:, 1],
+            "Absolute Difference %": diff[:, 2],
+            "Train Examples": tr_cnt,
+            "Test Examples": te_cnt,
+            "Test Precision %": precision,
+            "Test Recall %": recall,
+            "Test F1 %": f1,
+        }
+    )
+
+    if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty:
+        table_for_display = difference_pd[
+            difference_pd["Absolute Difference %"] > 0.001
+        ].sort_values(by=["Absolute Difference %"], ascending=False)
+        table_for_display = table_for_display.style.applymap(
+            _train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]]
+        )
 
     display(table_for_display)
     display(Markdown("\n"))
     display(Markdown("Distribution Mismatch Color Code"))
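The Styler.applymap call above attaches one CSS declaration to each cell in the subset. A minimal standalone sketch of that pattern (toy frame and hypothetical threshold; pandas needs jinja2 installed to render a Styler):

    import pandas as pd

    def coloring(val):
        # Same shape as _train_test_coloring: return a CSS string per cell.
        return "color: %s" % ("red" if val > 25 else "green")

    df = pd.DataFrame({"Absolute Difference %": [30.0, 5.0]})
    styled = df.style.applymap(coloring, subset=["Absolute Difference %"])
    # In a notebook, `styled` renders as an HTML table with red/green text.
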
@@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     display(Markdown("<font color = 'green'> Green - Good </font>"))
 
     if js >= 0:
-        js = np.round(js, 2)*100
-        display(Markdown("### Data Distribution Divergence Test vs Train \
-                         <font color='blue'>{}%</font>".format(js)))
+        js = np.round(js, 2) * 100
+        display(
+            Markdown(
+                "### Data Distribution Divergence Test vs Train \
+                <font color='blue'>{}%</font>".format(
+                    js
+                )
+            )
+        )
     display(Markdown("**Note** Metric used is Jensen Shannon Distance"))
 
     if missing_label:
         display(Markdown("### Missing Intents in Test Data"))
-        missing_label_pd = pd.DataFrame(missing_label,
-                                        columns=["Missing Intents in Test Set "])
-        missing_label_pd.index = np.arange(1, len(missing_label_pd)+1)
+        missing_label_pd = pd.DataFrame(
+            missing_label, columns=["Missing Intents in Test Set "]
+        )
+        missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1)
         display(missing_label_pd)
 
     display(Markdown("### Test Data Example Length"))
-    condition1 = (train_test_length_comparison_pd["Absolute Difference"] /
-                  train_test_length_comparison_pd["Train"] > .3)
-    condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3)
+    condition1 = (
+        train_test_length_comparison_pd["Absolute Difference"]
+        / train_test_length_comparison_pd["Train"]
+        > 0.3
+    )
+    condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3
 
     length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2]
 
     if not length_comparison_pd.empty:
-        display(Markdown(
-            "Divergence found in average length of user examples in test vs training data"))
-        length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1)
+        display(
+            Markdown(
+                "Divergence found in average length of user examples in test vs training data"
+            )
+        )
+        length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1)
         display(length_comparison_pd.round(2))
     else:
         display(Markdown("Average length of user examples is comparable"))
 
     if train_vocab and test_vocab:
         display(Markdown("### Vocabulary Size Test vs Train"))
-        oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \
-            / len(test_vocab)*100
-
-        vocab_df = pd.DataFrame(data={
-            'Train Vocabulary Size': [len(train_vocab)],
-            'Test Vocabulary Size': [len(test_vocab)],
-            '% Test Set Vocabulary not found in Train': [oov_vocab_percentage]})
+        oov_vocab_percentage = (
+            (len(test_vocab) - len(train_vocab.intersection(test_vocab)))
+            / len(test_vocab)
+            * 100
+        )
+
+        vocab_df = pd.DataFrame(
+            data={
+                "Train Vocabulary Size": [len(train_vocab)],
+                "Test Vocabulary Size": [len(test_vocab)],
+                "% Test Set Vocabulary not found in Train": [oov_vocab_percentage],
+            }
+        )
         vocab_df.index = np.arange(1, len(vocab_df) + 1)
         display(vocab_df.round(2))
 
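The out-of-vocabulary percentage at the end reduces to plain set arithmetic. A worked check on two hypothetical vocabularies:

    train_vocab = {"book", "a", "flight", "to", "boston"}
    test_vocab = {"book", "a", "train", "to", "denver"}

    # Test tokens never seen in train: {"train", "denver"} -> 2 of 5.
    oov = (len(test_vocab) - len(train_vocab & test_vocab)) / len(test_vocab) * 100
    print(oov)  # 40.0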