-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess.py
More file actions
44 lines (42 loc) · 1.24 KB
/
preprocess.py
File metadata and controls
44 lines (42 loc) · 1.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import spacy, pandas
from spacy.tokens import DocBin
# Load the labelled training phrases. Each row holds a phrase plus one
# "[start, end]" character-offset string per entity column.
raw = pandas.read_csv(
    "Training-phrases-generation/detergent_training_phrases_index.csv")
# Drop exact duplicate rows so the same example is not written to the
# spaCy corpus twice. ignore_index=True renumbers rows 0..n-1, which keeps
# the positional lookups below (raw[col][index]) aligned after the drop.
raw.drop_duplicates(subset=None, inplace=True, ignore_index=True)
# Build (text, [(start, end, LABEL), ...]) pairs in the shape spaCy's NER
# training expects. Each entity column stores character offsets as a string
# like "[3, 10]" within the corresponding training phrase.
trainingData = []
label = ["Product", "Quantity", "Size"]
for index, phrase in enumerate(raw['Training Phrase']):
    entities = []
    for col in label:
        # "[3, 10]" -> ["3", " 10"]; strip('[]') removes both brackets at once.
        parts = raw[col][index].strip('[]').split(',')
        entities.append((int(parts[0]), int(parts[1]), col.upper()))
    trainingData.append((phrase, entities))
# for i in trainingData:
# print(i)
# Serialize the (text, entities) pairs into a spaCy DocBin training corpus.
# A blank pipeline is enough here: only the tokenizer is needed to build Docs.
nlp = spacy.blank('en')
db = DocBin()
for text, annotation in trainingData:
    doc = nlp(text)
    ents = []
    for start, end, label in annotation:
        span = doc.char_span(start, end, label=label)
        # char_span returns None when (start, end) does not align with token
        # boundaries; appending None would make `doc.ents = ents` raise a
        # TypeError and abort the whole export, so skip and report instead.
        if span is None:
            print(f"Skipping misaligned entity {label} [{start}:{end}] in: {text!r}")
            continue
        ents.append(span)
    doc.ents = ents
    db.add(doc)
db.to_disk('training/train.spacy')