
Commit ada1286

futurulustiberiu44 authored and committed
Prevent empty sentences in tokenization (#114)
In some cases (usually involving sequences of multiple whitespace characters), the tokenizer can produce sentences with zero tokens. This causes errors later in the pipeline, specifically the following: ``` File "/usr/local/lib/python3.6/dist-packages/cube/api.py" line 194 in __call__ sequences = self._parser.parse_sequences(sequences) File "/usr/local/lib/python3.6/dist-packages/cube/generic_networks/parsers.py" line 496 in parse_sequences predicted_tags = self.tag(new_sequence) File "/usr/local/lib/python3.6/dist-packages/cube/generic_networks/parsers.py" line 226 in tag arc_matrix, aux_arc_matrix, proj_labels, softmax_morphology = self._predict_arc(seq) File "/usr/local/lib/python3.6/dist-packages/cube/generic_networks/parsers.py" line 470 in _predict_arc s_max = dy.softmax(dy.concatenate(s_max)) File "_dynet.pyx" line 4605 in _dynet.concatenate File "_dynet.pyx" line 4618 in _dynet.concatenate AssertionError: List is empty, nothing to concatenate. ``` This change removes empty sequences from the tokenization output.
1 parent 466637b commit ada1286

File tree

1 file changed (+2 −1)


cube/generic_networks/tokenizers.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -388,7 +388,8 @@ def tokenize(self, input_string):
                     if input_string[index + 1] in string.whitespace:
                         space_after_end_of_sentence = True
                 seq = self._get_tokens(w.strip(), space_after_end_of_sentence=space_after_end_of_sentence)
-                sequences.append(seq)
+                if seq:
+                    sequences.append(seq)
                 w = ""
                 last_ss_break = index
                 last_checked_index = index
```
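Applying the same `if seq:` guard in the sketch above (again with hypothetical names; `raw.split()` stands in for the repository's `self._get_tokens(...)`) keeps zero-token sentences out of the output:

```python
def tokenize(text):
    sequences = []
    for raw in text.split("."):  # naive sentence splitting, as above
        seq = raw.split()        # stand-in for self._get_tokens(...)
        if seq:                  # the fix: skip empty token sequences
            sequences.append(seq)
    return sequences

print(tokenize("First sentence.   \n   .Second sentence."))
# [['First', 'sentence'], ['Second', 'sentence']]
```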
