|
3 | 3 |
|
4 | 4 | For machine translation, we use EasyNMT: https://github.com/UKPLab/EasyNMT |
5 | 5 | You can install it via: pip install easynmt |
| 6 | +
|
| 7 | +Usage: |
| 8 | +python translate_queries.py <target_language> |
6 | 9 | """ |
7 | 10 | import os |
8 | 11 | from sentence_transformers import LoggingHandler, util |
9 | 12 | import logging |
10 | 13 | import tarfile |
11 | 14 | from easynmt import EasyNMT |
| 15 | +import sys |
12 | 16 |
|
13 | 17 | #### Just some code to print debug information to stdout |
14 | 18 | logging.basicConfig(format='%(asctime)s - %(message)s', |
|
17 | 21 | handlers=[LoggingHandler()]) |
18 | 22 | #### /print debug information to stdout |
19 | 23 |
|
20 | | -target_lang = 'de' |
| 24 | +target_lang = sys.argv[1] |
21 | 25 | output_folder = 'multilingual-data' |
| 26 | +data_folder = '../msmarco-data' |
| 27 | + |
22 | 28 | output_filename = os.path.join(output_folder, 'train_queries.en-{}.tsv'.format(target_lang)) |
23 | 29 | os.makedirs(output_folder, exist_ok=True) |
24 | 30 |
|
|
32 | 38 | translated_qids.add(splits[0]) |
33 | 39 |
|
34 | 40 | ### Now we read the MS Marco dataset |
35 | | -data_folder = '../msmarco-data' |
36 | 41 | os.makedirs(data_folder, exist_ok=True) |
37 | 42 |
|
38 | 43 | # Read qrels file for relevant positives per query |
|
78 | 83 |
|
79 | 84 | with open(output_filename, 'a' if os.path.exists(output_filename) else 'w', encoding='utf8') as fOut: |
80 | 85 | for qid, query, translated_query in zip(qids, queries, translation_model.translate_stream(queries, source_lang='en', target_lang=target_lang, beam_size=2, perform_sentence_splitting=False, chunk_size=256, batch_size=64)): |
81 | | - fOut.write("{}\t{}\t{}\n".format(qid, query.replace("\t", " "), translated_query.replace("\t", " "))) |
| 86 | + # Each row is: qid <TAB> translated query. The format string must have exactly two placeholders, otherwise str.format raises IndexError. |
| 87 | + fOut.write("{}\t{}\n".format(qid, translated_query.replace("\t", " "))) |
82 | 88 | fOut.flush() |
0 commit comments