
Commit 6a4c729

haodeqin authored and navneetrao committed
feat(notebook): Update notebook to align with IBM Gallery (#17)
* fix(chi2): update chi2 to keep single character
* chore(all): reformat everything
* feat(notebook): align notebook with studio version with minor doc updates
* test(end2end): update end2end test
* chore(version): update version to 1.1.0
* chore(license): update license information
1 parent ea17fda commit 6a4c729
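For context on the chi2 item in the commit message: scikit-learn's default CountVectorizer token pattern (r"(?u)\b\w\w+\b") silently drops one-character tokens, so "keep single character" most likely refers to widening that pattern before computing chi-squared keyword statistics. The snippet below is only an illustrative sketch with invented utterances and a hypothetical vectorizer configuration (assuming a recent scikit-learn), not the repository's actual chi2 code:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

# invented toy data for illustration
utterances = ["pay my bill", "i lost my card", "what is my balance", "block card a"]
labels = ["billing", "card", "billing", "card"]

# token_pattern widened from the default \w\w+ to \w+ so one-character tokens
# such as "i" or "a" survive and can appear in the chi2 ranking
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))
features = vectorizer.fit_transform(utterances)

chi2_scores, _ = chi2(features, labels)
ranked = sorted(
    zip(vectorizer.get_feature_names_out(), chi2_scores), key=lambda x: -x[1]
)
print(ranked[:5])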

16 files changed: +1326 -785 lines changed


_version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Semantic versioning
 # MAJOR.MINOR.PATCH

-__version__ = '1.0.2'
+__version__ = '1.1.0'

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = '1.0.2'
+__version__ = "1.1.0"

assistant_dialog_skill_analysis/confidence_analysis/confidence_analyzer.py

Lines changed: 226 additions & 120 deletions
Large diffs are not rendered by default.

assistant_dialog_skill_analysis/data_analysis/divergence_analyzer.py

Lines changed: 128 additions & 82 deletions
@@ -13,9 +13,11 @@ def _label_percentage(data_frame):
     :return: label_percentage_dict: dictionary maps label : % of labels
     """
     total_examples = len(data_frame)
-    label_frequency_dict = dict(Counter(data_frame['intent']).most_common())
-    percentage_list = np.array(list(label_frequency_dict.values()))/total_examples
-    label_percentage_dict = dict(zip(list(label_frequency_dict.keys()), percentage_list))
+    label_frequency_dict = dict(Counter(data_frame["intent"]).most_common())
+    percentage_list = np.array(list(label_frequency_dict.values())) / total_examples
+    label_percentage_dict = dict(
+        zip(list(label_frequency_dict.keys()), percentage_list)
+    )
     return label_percentage_dict


@@ -26,15 +28,17 @@ def _train_test_coloring(val):
     :return:
     """
     if val > 25:
-        color = 'red'
+        color = "red"
     elif val > 10:
-        color = 'DarkBlue'
+        color = "DarkBlue"
     else:
-        color = 'green'
-    return 'color: %s' % color
+        color = "green"
+    return "color: %s" % color


-def _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict):
+def _train_test_label_difference(
+    workspace_label_percentage_dict, test_label_percentage_dict
+):
     """
     analyze the difference between training set and test set
     :param workspace_label_percentage_dict:

@@ -66,9 +70,11 @@ def _train_test_label_difference(workspace_label_percentage_dict, test_label_per
         current_difference = np.abs(test_percentage - workspace_percentage)

         if key in test_label_percentage_dict:
-            difference_dict[key] = [workspace_percentage*100,
-                                    test_percentage*100,
-                                    current_difference*100]
+            difference_dict[key] = [
+                workspace_percentage * 100,
+                test_percentage * 100,
+                current_difference * 100,
+            ]

     js_distance = distance.jensenshannon(distribution1, distribution2, 2.0)


@@ -86,8 +92,8 @@ def _train_test_vocab_difference(train_set_pd, test_set_pd):
     """
     train_vocab = set()
     test_vocab = set()
-    train_set_tokens = train_set_pd['utterance'].apply(word_tokenize)
-    test_set_tokens = test_set_pd['utterance'].apply(word_tokenize)
+    train_set_tokens = train_set_pd["utterance"].apply(word_tokenize)
+    test_set_tokens = test_set_pd["utterance"].apply(word_tokenize)

     for tokens in train_set_tokens.tolist():
         train_vocab.update(tokens)

@@ -107,24 +113,26 @@ def _train_test_utterance_length_difference(train_set_pd, test_set_pd):
     train_test_legnth_comparison: pandas dataframe [Intent, Absolute Difference]
     """
     train_pd_temp = train_set_pd.copy()
-    train_pd_temp['tokens'] = train_set_pd['utterance'].apply(word_tokenize)
-    train_pd_temp['Train'] = train_pd_temp['tokens'].apply(len)
-    train_avg_len_by_label = train_pd_temp[['intent', 'Train']].groupby('intent').mean()
+    train_pd_temp["tokens"] = train_set_pd["utterance"].apply(word_tokenize)
+    train_pd_temp["Train"] = train_pd_temp["tokens"].apply(len)
+    train_avg_len_by_label = train_pd_temp[["intent", "Train"]].groupby("intent").mean()

     test_pd_temp = test_set_pd.copy()
-    test_pd_temp['tokens'] = test_set_pd['utterance'].apply(word_tokenize)
-    test_pd_temp['Test'] = test_pd_temp['tokens'].apply(len)
-    test_avg_len_by_label = test_pd_temp[['intent', 'Test']].groupby('intent').mean()
-
-    train_test_length_comparison = pd.merge(train_avg_len_by_label,
-                                            test_avg_len_by_label, on='intent')
-    train_test_length_comparison['Absolute Difference'] = \
-        np.abs(train_test_length_comparison['Train'] - train_test_length_comparison['Test'])
+    test_pd_temp["tokens"] = test_set_pd["utterance"].apply(word_tokenize)
+    test_pd_temp["Test"] = test_pd_temp["tokens"].apply(len)
+    test_avg_len_by_label = test_pd_temp[["intent", "Test"]].groupby("intent").mean()
+
+    train_test_length_comparison = pd.merge(
+        train_avg_len_by_label, test_avg_len_by_label, on="intent"
+    )
+    train_test_length_comparison["Absolute Difference"] = np.abs(
+        train_test_length_comparison["Train"] - train_test_length_comparison["Test"]
+    )
     train_test_length_comparison = train_test_length_comparison.sort_values(
-        by=["Absolute Difference"], ascending=False)
+        by=["Absolute Difference"], ascending=False
+    )
     train_test_length_comparison = train_test_length_comparison.reset_index()
-    train_test_length_comparison.rename(columns={'intent':'Intent'
-                                                 }, inplace=True)
+    train_test_length_comparison.rename(columns={"intent": "Intent"}, inplace=True)
     return train_test_length_comparison


@@ -137,8 +145,8 @@ def _get_metrics(results):
         recall_dict: maps the {intent: recall}
         f1_dict: maps the {intent:f1}
     """
-    groundtruth = results['correct_intent'].values.tolist()
-    top_intent = results['top_intent'].values.tolist()
+    groundtruth = results["correct_intent"].values.tolist()
+    top_intent = results["top_intent"].values.tolist()
     gt_cnt_dict = dict()
     pred_cnt_dict = dict()
     true_positive_dict = dict()

@@ -152,13 +160,22 @@ def _get_metrics(results):
     f1_dict = dict()
     for lb in true_positive_dict:

-        recall_dict[lb] = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
-
-        precision_dict[lb] = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict \
-            else 0
-
-        f1_dict[lb] = 0.0 if recall_dict[lb] == 0 and precision_dict[lb] == 0 \
-            else 2.0 * recall_dict[lb] * precision_dict[lb] / (recall_dict[lb] + precision_dict[lb])
+        recall_dict[lb] = (
+            true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
+        )
+
+        precision_dict[lb] = (
+            true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0
+        )
+
+        f1_dict[lb] = (
+            0.0
+            if recall_dict[lb] == 0 and precision_dict[lb] == 0
+            else 2.0
+            * recall_dict[lb]
+            * precision_dict[lb]
+            / (recall_dict[lb] + precision_dict[lb])
+        )
     return precision_dict, recall_dict, f1_dict
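The reformatted _get_metrics block above computes per-intent precision, recall, and F1 from parallel lists of ground-truth and predicted intents. As a standalone sketch of the same arithmetic (the lists and Counter-based tallies below are invented for illustration, not the module's actual inputs):

from collections import Counter

groundtruth = ["billing", "card", "billing", "card", "card"]
top_intent = ["billing", "billing", "billing", "card", "card"]

gt_cnt_dict = Counter(groundtruth)
pred_cnt_dict = Counter(top_intent)
true_positive_dict = Counter(g for g, p in zip(groundtruth, top_intent) if g == p)

for lb in true_positive_dict:
    recall = true_positive_dict[lb] / gt_cnt_dict[lb] if lb in gt_cnt_dict else 0
    precision = true_positive_dict[lb] / pred_cnt_dict[lb] if lb in pred_cnt_dict else 0
    f1 = (
        0.0
        if recall == 0 and precision == 0
        else 2.0 * recall * precision / (recall + precision)
    )
    print(lb, round(precision, 2), round(recall, 2), round(f1, 2))
# prints: billing 0.67 1.0 0.8
#         card 1.0 0.67 0.8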

@@ -172,12 +189,14 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     workspace_label_percentage_dict = _label_percentage(train_set_pd)
     test_label_percentage_dict = _label_percentage(test_set_pd)

-    missing_label, difference_dict, js = \
-        _train_test_label_difference(workspace_label_percentage_dict, test_label_percentage_dict)
+    missing_label, difference_dict, js = _train_test_label_difference(
+        workspace_label_percentage_dict, test_label_percentage_dict
+    )
     train_vocab, test_vocab = _train_test_vocab_difference(train_set_pd, test_set_pd)

-    train_test_length_comparison_pd = \
-        _train_test_utterance_length_difference(train_set_pd, test_set_pd)
+    train_test_length_comparison_pd = _train_test_utterance_length_difference(
+        train_set_pd, test_set_pd
+    )

     display(Markdown("## Test Data Evaluation"))

@@ -186,35 +205,43 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     label = list(difference_dict.keys())
     diff = np.round(list(difference_dict.values()), 2)
     precision_dict, recall_dict, f1_dict = _get_metrics(results)
-    precision = np.round([precision_dict[l]*100.0 if l in precision_dict else 0.0
-                          for l in label], 2)
+    precision = np.round(
+        [precision_dict[l] * 100.0 if l in precision_dict else 0.0 for l in label],
+        2,
+    )

-    recall = np.round([recall_dict[l]*100.0 if l in recall_dict else 0.0 for l in label], 2)
+    recall = np.round(
+        [recall_dict[l] * 100.0 if l in recall_dict else 0.0 for l in label], 2
+    )

-    f1 = np.round([f1_dict[l]*100.0 if l in f1_dict else 0.0 for l in label], 2)
+    f1 = np.round([f1_dict[l] * 100.0 if l in f1_dict else 0.0 for l in label], 2)

-    train_count_dict = dict(Counter(train_set_pd['intent']))
-    test_count_dict = dict(Counter(test_set_pd['intent']))
+    train_count_dict = dict(Counter(train_set_pd["intent"]))
+    test_count_dict = dict(Counter(test_set_pd["intent"]))
     tr_cnt = [train_count_dict[l] if l in train_count_dict else 0.0 for l in label]
     te_cnt = [test_count_dict[l] if l in test_count_dict else 0.0 for l in label]

-    difference_pd = pd.DataFrame({"Intent": label,
-                                  "% of Train": diff[:, 0],
-                                  "% of Test": diff[:, 1],
-                                  "Absolute Difference %": diff[:, 2],
-                                  "Train Examples": tr_cnt,
-                                  "Test Examples": te_cnt,
-                                  "Test Precision %": precision,
-                                  "Test Recall %": recall,
-                                  "Test F1 %": f1})
-
-    if not difference_pd[difference_pd["Absolute Difference %"] > .001].empty:
-        table_for_display = difference_pd[difference_pd["Absolute Difference %"]
-                                          > .001].sort_values(by=["Absolute Difference %"],
-                                                              ascending=False)
-        table_for_display = \
-            table_for_display.style.applymap(_train_test_coloring,
-                                             subset=pd.IndexSlice[:, ["Absolute Difference %"]])
+    difference_pd = pd.DataFrame(
+        {
+            "Intent": label,
+            "% of Train": diff[:, 0],
+            "% of Test": diff[:, 1],
+            "Absolute Difference %": diff[:, 2],
+            "Train Examples": tr_cnt,
+            "Test Examples": te_cnt,
+            "Test Precision %": precision,
+            "Test Recall %": recall,
+            "Test F1 %": f1,
+        }
+    )
+
+    if not difference_pd[difference_pd["Absolute Difference %"] > 0.001].empty:
+        table_for_display = difference_pd[
+            difference_pd["Absolute Difference %"] > 0.001
+        ].sort_values(by=["Absolute Difference %"], ascending=False)
+        table_for_display = table_for_display.style.applymap(
+            _train_test_coloring, subset=pd.IndexSlice[:, ["Absolute Difference %"]]
+        )
         display(table_for_display)
         display(Markdown("\n"))
         display(Markdown("Distribution Mismatch Color Code"))

@@ -223,42 +250,61 @@ def analyze_train_test_diff(train_set_pd, test_set_pd, results):
     display(Markdown("<font color = 'green'> Green - Good </font>"))

     if js >= 0:
-        js = np.round(js, 2)*100
-        display(Markdown("### Data Distribution Divergence Test vs Train \
-        <font color='blue'>{}%</font>" .format(js)))
+        js = np.round(js, 2) * 100
+        display(
+            Markdown(
+                "### Data Distribution Divergence Test vs Train \
+        <font color='blue'>{}%</font>".format(
+                    js
+                )
+            )
+        )
         display(Markdown("**Note** Metric used is Jensen Shannon Distance"))

     if missing_label:
         display(Markdown("### Missing Intents in Test Data"))
-        missing_label_pd = pd.DataFrame(missing_label,
-                                        columns=["Missing Intents in Test Set "])
-        missing_label_pd.index = np.arange(1, len(missing_label_pd)+1)
+        missing_label_pd = pd.DataFrame(
+            missing_label, columns=["Missing Intents in Test Set "]
+        )
+        missing_label_pd.index = np.arange(1, len(missing_label_pd) + 1)
         display(missing_label_pd)

     display(Markdown("### Test Data Example Length"))
-    condition1 = (train_test_length_comparison_pd["Absolute Difference"] /
-                  train_test_length_comparison_pd["Train"] > .3)
-    condition2 = (train_test_length_comparison_pd["Absolute Difference"] > 3)
+    condition1 = (
+        train_test_length_comparison_pd["Absolute Difference"]
+        / train_test_length_comparison_pd["Train"]
+        > 0.3
+    )
+    condition2 = train_test_length_comparison_pd["Absolute Difference"] > 3

     length_comparison_pd = train_test_length_comparison_pd[condition1 & condition2]

     if not length_comparison_pd.empty:
-        display(Markdown(
-            "Divergence found in average length of user examples in test vs training data"))
-        length_comparison_pd.index = np.arange(1, len(length_comparison_pd)+1)
+        display(
+            Markdown(
+                "Divergence found in average length of user examples in test vs training data"
+            )
+        )
+        length_comparison_pd.index = np.arange(1, len(length_comparison_pd) + 1)
         display(length_comparison_pd.round(2))
     else:
         display(Markdown("Average length of user examples is comparable"))

     if train_vocab and test_vocab:
         display(Markdown("### Vocabulary Size Test vs Train"))
-        oov_vocab_percentage = (len(test_vocab) - len(train_vocab.intersection(test_vocab))) \
-            / len(test_vocab)*100
-
-        vocab_df = pd.DataFrame(data={
-            'Train Vocabulary Size': [len(train_vocab)],
-            'Test Vocabulary Size': [len(test_vocab)],
-            '% Test Set Vocabulary not found in Train': [oov_vocab_percentage]})
+        oov_vocab_percentage = (
+            (len(test_vocab) - len(train_vocab.intersection(test_vocab)))
+            / len(test_vocab)
+            * 100
+        )
+
+        vocab_df = pd.DataFrame(
+            data={
+                "Train Vocabulary Size": [len(train_vocab)],
+                "Test Vocabulary Size": [len(test_vocab)],
+                "% Test Set Vocabulary not found in Train": [oov_vocab_percentage],
+            }
+        )
         vocab_df.index = np.arange(1, len(vocab_df) + 1)
         display(vocab_df.round(2))
assistant_dialog_skill_analysis/data_analysis/similarity_analyzer.py

Lines changed: 35 additions & 17 deletions
@@ -5,7 +5,7 @@
 from IPython.display import display, Markdown, HTML


-def ambiguous_examples_analysis(workspace_pd, threshold=.7):
+def ambiguous_examples_analysis(workspace_pd, threshold=0.7):
     """
     Analyze the test workspace and find out similar utterances that belongs to different intent
     :param workspace_pd: pandas dataframe in format of [utterance,label]

@@ -15,31 +15,49 @@ def ambiguous_examples_analysis(workspace_pd, threshold=.7):
     """
     # first create the feature matrix
     vectorizer = CountVectorizer(ngram_range=(1, 2))
-    workspace_bow = vectorizer.fit_transform(workspace_pd['utterance']).todense()
+    workspace_bow = vectorizer.fit_transform(workspace_pd["utterance"]).todense()
     cos_sim_score_matrix = _calculate_cosine_similarity(workspace_bow)

     # remove the lower triangle of the matrix and apply threshold
-    similar_utterance_index = np.argwhere((cos_sim_score_matrix - np.tril(cos_sim_score_matrix))
-                                          > threshold)
-    similar_utterance_pd = pd.DataFrame(columns=['Intent1', 'Utterance1', 'Intent2', 'Utterance2',
-                                                 'similarity score'])
+    similar_utterance_index = np.argwhere(
+        (cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) > threshold
+    )
+    similar_utterance_pd = pd.DataFrame(
+        columns=["Intent1", "Utterance1", "Intent2", "Utterance2", "similarity score"]
+    )

     for index in similar_utterance_index:
-        if workspace_pd['intent'].iloc[index[0]] != workspace_pd['intent'].iloc[index[1]]:
-            intent1 = workspace_pd['intent'].iloc[index[0]]
-            utterance1 = workspace_pd['utterance'].iloc[index[0]]
-            intent2 = workspace_pd['intent'].iloc[index[1]]
-            utterance2 = workspace_pd['utterance'].iloc[index[1]]
+        if (
+            workspace_pd["intent"].iloc[index[0]]
+            != workspace_pd["intent"].iloc[index[1]]
+        ):
+            intent1 = workspace_pd["intent"].iloc[index[0]]
+            utterance1 = workspace_pd["utterance"].iloc[index[0]]
+            intent2 = workspace_pd["intent"].iloc[index[1]]
+            utterance2 = workspace_pd["utterance"].iloc[index[1]]
             score = cos_sim_score_matrix[index[0], index[1]]
             temp_pd = pd.DataFrame(
-                {'Intent1': [intent1], 'Utterance1': [utterance1], 'Intent2': [intent2],
-                 'Utterance2': [utterance2], 'similarity score': [score]})
-            similar_utterance_pd = similar_utterance_pd.append(temp_pd, ignore_index=True)
+                {
+                    "Intent1": [intent1],
+                    "Utterance1": [utterance1],
+                    "Intent2": [intent2],
+                    "Utterance2": [utterance2],
+                    "similarity score": [score],
+                }
+            )
+            similar_utterance_pd = similar_utterance_pd.append(
+                temp_pd, ignore_index=True
+            )

     if not similar_utterance_pd.empty:
-        with pd.option_context('max_colwidth', 250):
-            display(HTML(similar_utterance_pd.sort_values(by=['similarity score'],
-                                                          ascending=False).to_html(index=False)))
+        with pd.option_context("max_colwidth", 250):
+            display(
+                HTML(
+                    similar_utterance_pd.sort_values(
+                        by=["similarity score"], ascending=False
+                    ).to_html(index=False)
+                )
+            )
     else:
         display(Markdown("### There are no similar utterances within different Intent"))
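ambiguous_examples_analysis, reformatted above, flags pairs of utterances from different intents whose bag-of-n-gram vectors are nearly identical; the default threshold of 0.7 comes from the function signature. A rough sketch of the same idea on invented data, substituting sklearn's cosine_similarity for the module's private _calculate_cosine_similarity helper:

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# invented toy workspace for illustration
workspace_pd = pd.DataFrame(
    {
        "utterance": [
            "cancel my credit card",
            "cancel my credit card order",
            "what is my balance",
        ],
        "intent": ["card", "orders", "billing"],
    }
)

vectorizer = CountVectorizer(ngram_range=(1, 2))
workspace_bow = vectorizer.fit_transform(workspace_pd["utterance"])
cos_sim_score_matrix = cosine_similarity(workspace_bow)

# zero out the lower triangle and diagonal so each pair is checked once, then threshold
similar_utterance_index = np.argwhere(
    (cos_sim_score_matrix - np.tril(cos_sim_score_matrix)) > 0.7
)
for i, j in similar_utterance_index:
    if workspace_pd["intent"].iloc[i] != workspace_pd["intent"].iloc[j]:
        print(
            workspace_pd["utterance"].iloc[i],
            "|",
            workspace_pd["utterance"].iloc[j],
            round(cos_sim_score_matrix[i, j], 2),
        )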
