Skip to content

Commit 1a25003

Browse files
committed
[SPARKNLP-1244] NerDLGraphChecker
It can check the required graph params before running any computations
1 parent f4df856 commit 1a25003

File tree

9 files changed

+1250
-0
lines changed

9 files changed

+1250
-0
lines changed
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
{%- capture title -%}
2+
NerDLGraphChecker
3+
{%- endcapture -%}
4+
5+
{%- capture description -%}
6+
Checks whether a suitable NerDL graph is available for the given training dataset, before any
7+
computations/training is done. This annotator is useful for custom training cases, where
8+
specialized graphs might not be available and we want to check before embeddings are evaluated.
9+
10+
Important: This annotator should be used or positioned before any embedding or NerDLApproach
11+
annotators in the pipeline and will process the whole dataset to extract the required graph parameters.
12+
13+
This annotator requires a dataset with at least two columns: one with tokens and one with the
14+
labels. In addition, it requires the embeddings annotator that is used in the pipeline, in order to
15+
determine the matching embedding dimension.
16+
17+
For extended examples of usage, see the
18+
[example notebook](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/dl-ner/ner_dl_graph_checker.ipynb)
19+
and the
20+
[NerDLGraphCheckerTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLGraphCheckerTestSpec.scala).
21+
22+
{%- endcapture -%}
23+
24+
{%- capture input_anno -%}
25+
DOCUMENT, TOKEN
26+
{%- endcapture -%}
27+
28+
{%- capture output_anno -%}
29+
NONE
30+
{%- endcapture -%}
31+
32+
{%- capture python_example -%}
33+
import sparknlp
34+
from sparknlp.base import *
35+
from sparknlp.annotator import*
36+
from pyspark.ml import Pipeline
37+
conll = CoNLL()
38+
trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
39+
embeddings = BertEmbeddings \
40+
.pretrained() \
41+
.setInputCols(["sentence", "token"]) \
42+
.setOutputCol("embeddings")
43+
nerDLGraphChecker = NerDLGraphChecker() \
44+
.setInputCols(["sentence", "token"]) \
45+
.setLabelColumn("label") \
46+
.setEmbeddingsModel(embeddings)
47+
nerTagger = NerDLApproach() \
48+
.setInputCols(["sentence", "token", "embeddings"]) \
49+
.setLabelColumn("label") \
50+
.setOutputCol("ner") \
51+
.setMaxEpochs(1) \
52+
.setRandomSeed(0) \
53+
.setVerbose(0)
54+
pipeline = Pipeline().setStages([nerDLGraphChecker, embeddings, nerTagger])
55+
# Will throw an exception if no suitable graph is found
56+
pipelineModel = pipeline.fit(trainingData)
57+
{%- endcapture -%}
58+
59+
{%- capture scala_example -%}
60+
import com.johnsnowlabs.nlp.annotator._
61+
import com.johnsnowlabs.nlp.training.CoNLL
62+
import org.apache.spark.ml.Pipeline
63+
64+
// This CoNLL dataset already includes a sentence, token and label
65+
// column with their respective annotator types. If a custom dataset is used,
66+
// these columns need to be created first, for example with a DocumentAssembler, SentenceDetector and Tokenizer.
67+
val conll = CoNLL()
68+
val trainingData = conll.readDataset(spark, "src/test/resources/conll2003/eng.train")
69+
70+
val embeddings = BertEmbeddings
71+
.pretrained()
72+
.setInputCols("sentence", "token")
73+
.setOutputCol("embeddings")
74+
75+
// Requires the data for NerDLApproach graphs: text, tokens, labels and the embedding model
76+
val nerDLGraphChecker = new NerDLGraphChecker()
77+
.setInputCols("sentence", "token")
78+
.setLabelColumn("label")
79+
.setEmbeddingsModel(embeddings)
80+
81+
val nerTagger = new NerDLApproach()
82+
.setInputCols("sentence", "token", "embeddings")
83+
.setLabelColumn("label")
84+
.setOutputCol("ner")
85+
.setMaxEpochs(1)
86+
.setRandomSeed(0)
87+
.setVerbose(0)
88+
89+
val pipeline = new Pipeline().setStages(
90+
Array(nerDLGraphChecker, embeddings, nerTagger))
91+
92+
// Will throw an exception if no suitable graph is found
93+
val pipelineModel = pipeline.fit(trainingData)
94+
{%- endcapture -%}
95+
96+
{%- capture api_link -%}
97+
[NerDLGraphChecker](/api/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLGraphChecker)
98+
{%- endcapture -%}
99+
100+
{%- capture python_api_link -%}
101+
[NerDLGraphChecker](/api/python/reference/autosummary/sparknlp/annotator/ner/ner_dl_graph_checker/index.html#sparknlp.annotator.ner.ner_dl_graph_checker.NerDLGraphChecker)
102+
{%- endcapture -%}
103+
104+
{%- capture source_link -%}
105+
[NerDLGraphChecker](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLGraphChecker.scala)
106+
{%- endcapture -%}
107+
108+
{% include templates/anno_template.md
109+
title=title
110+
description=description
111+
input_anno=input_anno
112+
output_anno=output_anno
113+
python_example=python_example
114+
scala_example=scala_example
115+
api_link=api_link
116+
python_api_link=python_api_link
117+
source_link=source_link
118+
%}

docs/en/annotators.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,8 @@ There are two types of Annotators:
8282
{% include templates/anno_table_entry.md path="" name="NerConverter" summary="Converts an IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%}
8383
{% include templates/anno_table_entry.md path="" name="NerCrf" summary="Extracts Named Entities based on a CRF Model."%}
8484
{% include templates/anno_table_entry.md path="" name="NerDL" summary="This Named Entity recognition annotator is a generic NER model based on Neural Networks."%}
85+
{% include templates/anno_table_entry.md path="" name="NerDLGraphChecker" summary="Checks whether a suitable NerDL graph is available for the given training dataset, before any computations/training is done."%}
8587
{% include templates/anno_table_entry.md path="" name="NerOverwriter" summary="Overwrites entities of specified strings."%}
8688
{% include templates/anno_table_entry.md path="" name="Normalizer" summary="Removes all dirty characters from text following a regex pattern and transforms words based on a provided dictionary."%}
8789
{% include templates/anno_table_entry.md path="" name="NorvigSweeting Spellchecker" summary="Retrieves tokens and makes corrections automatically if not found in an English dictionary."%}

0 commit comments

Comments
 (0)