Commit 4b3685d8 authored by aj7tesh

Merge branch 'experiments' into release_anuvaad

Showing 87 additions and 12 deletions
@@ -375,26 +375,28 @@
},
{
"id": 57,
"model": "model_enBeng-en-to-beng-2_2020-02-18-model_step_200000.pt",
"model": "model_enBeng-en-to-bn-3.2_2020-12-28-model_step_190000.pt",
"timeout": 600,
"on_timeout": "to_cpu",
"load": true,
"opt": {
"beam_size": 5,
"log_file":"log_file.txt",
"log_file_level":"INFO"
"log_file_level":"INFO",
"gpu":0
}
},
{
"id": 58,
"model": "model_enBeng-beng-to-en-1_2020-02-18-model_step_200000.pt",
"model": "model_enBeng-bn-to-en-2.2_2020-12-28-model_step_180000.pt",
"timeout": 600,
"on_timeout": "to_cpu",
"load": true,
"opt": {
"beam_size": 5,
"log_file":"log_file.txt",
"log_file_level":"INFO"
"log_file_level":"INFO",
"gpu":0
}
},
{
......
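The two entries above follow the conf.json schema of OpenNMT-py's translation server: each model is addressed by a numeric id, loaded from a checkpoint, moved to CPU after 600 s of inactivity, and now pinned to GPU 0 for decoding. A minimal sketch of a client call against that server — the host, port, and example sentence are assumptions; the route and payload shape follow OpenNMT-py's REST server:

import requests

SERVER_URL = "http://localhost:5000/translator/translate"  # assumed address

def translate(text, model_id):
    # The server selects the checkpoint by "id" (57 = en->bn, 58 = bn->en above).
    payload = [{"src": text, "id": model_id}]
    response = requests.post(SERVER_URL, json=payload, timeout=600)
    response.raise_for_status()
    # OpenNMT-py returns a nested list of hypotheses; "tgt" holds the translation.
    return response.json()[0][0]["tgt"]

print(translate("The committee approved the proposal.", 57))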
@@ -43,6 +43,10 @@ english_bengali = {
"BENG_120919": "model/sentencepiece_models/beng-2019-09-12-10k.model",
"ENG_180220": "model/sentencepiece_models/enBeng-en-to-beng-2-2020-02-18-24k.model",
"BENG_180220": "model/sentencepiece_models/bengali-en-to-beng-2-2020-02-18-24k.model",
"ENG_281220": "model/sentencepiece_models/enBeng-en-to-bn-3.2-2020-12-28-24k.model",
"BENG_281220": "model/sentencepiece_models/bengali-en-to-bn-3.2-2020-12-28-24k.model",
"ENG_281220_2.2": "model/sentencepiece_models/enBeng-bn-to-en-2.2-2020-12-28-24k.model",
"BENG_281220_2.2": "model/sentencepiece_models/bengali-bn-to-en-2.2-2020-12-28-24k.model",
}
english_marathi = {
......
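The english_bengali mapping pairs date-stamped keys with the SentencePiece models used for subword segmentation on each side of the new en↔bn systems. A minimal sketch of how one of these models is applied, assuming the file path from the mapping exists on disk:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("model/sentencepiece_models/enBeng-en-to-bn-3.2-2020-12-28-24k.model")

pieces = sp.encode_as_pieces("The committee approved the proposal.")
print(pieces)                    # subword tokens fed to the NMT model
print(sp.decode_pieces(pieces))  # reconstructs the original sentence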
import csv
def token_finder(input_file,token):
    try:
        out_file = "file_with_{}".format(token)
@@ -63,7 +65,74 @@ def get_indices_for_same_lines_among_files(file_1,file_2):
        print(e)
index_array = token_finder("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt","" )
+def token_list_finder(input_file,token,outfile,no_match_tokens,one_match_tokens):
+    try:
+        # out_file = "file_with_{}".format("token_list")
+        sent_count = 0
+        line_numbers = list()
+        # outfile = open("{0}".format(out_file), "w")
+        with open(input_file) as input_file:
+            for num,line in enumerate(input_file,1):
+                if any(v in line for v in [token, token.lower(),token.title()]):
+                    outfile.write(line)
+                    sent_count = sent_count + 1
+                    if sent_count == 2:
+                        return
+        if sent_count == 0:
+            no_match_tokens.append(token)
+            # line_numbers.append(num)
+        if sent_count == 1:
+            one_match_tokens.append(token)
+        # outfile.close()
+        # print(line_numbers)
+        return None
+    except Exception as e:
+        print("exception: ",e)
+def create_tmx_data(tokens):
+    one_match_tokens = list()
+    no_match_tokens = list()
+    out_file = "file_with_{}".format("token_list")
+    outfile = open("{0}".format(out_file), "w")
+    tokens_list = tokens
+    for token in tokens_list:
+        token_list_finder("corpus/original_data/english_bengali/final_bn_source.txt",token,outfile,no_match_tokens,one_match_tokens)
+    print("all done")
+    print(len(no_match_tokens))
+    print("final no match: ",no_match_tokens)
+    print(len(one_match_tokens))
+    print(one_match_tokens)
+def csv_to_list():
+    token_list = list()
+    with open('corpus/tmx-english.csv', 'r') as fd:
+        reader = csv.reader(fd)
+        for row in reader:
+            token_list.append(str(row[0]))
+    # print(token_list)
+    print(len(token_list))
+    print("done")
+    return token_list
+tokens = csv_to_list()
+create_tmx_data(tokens)
+def drop_duplicate(inFile,outFile):
+    lines_seen = set()  # holds lines already seen
+    outfile = open("{0}".format(outFile), "w")
+    for line in open("{0}".format(inFile), "r"):
+        if line not in lines_seen:  # not a duplicate
+            outfile.write(line)
+            lines_seen.add(line)
+    outfile.close()
+# drop_duplicate("file_with_token_list","file_with_token_list_nodup-2")
# index_array = token_finder("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt","" )
# line_extracter_using_index("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt",index_array,"test_en.txt")
line_extracter_using_index("corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1",index_array,"test.txt")
# line_extracter_using_index("corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1",index_array,"test.txt")
# lines_without_token("test3.txt","ADV","test4.txt")
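line_extracter_using_index is called above but defined outside this hunk. A plausible minimal implementation, assuming index_array holds 1-based line numbers to copy into the output file — a hedged reconstruction, not the project's actual code:

def line_extracter_using_index(input_file, index_array, out_file):
    # Hypothetical reconstruction: keep only the lines whose (1-based)
    # numbers appear in index_array.
    indices = set(index_array)
    with open(input_file) as src, open(out_file, "w") as dst:
        for num, line in enumerate(src, 1):
            if num in indices:
                dst.write(line)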
@@ -353,14 +353,14 @@ def translate_func(inputs, translation_server):
translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_punjabi["PUNJABI_160220"],sp_model.english_punjabi["ENG_160220"])
translation = sentence_processor.moses_detokenizer(translation)
elif i['id'] == 57:
"en-bengali 2nd"
"en-bengali 3rd"
i['src'] = sentence_processor.moses_tokenizer(i['src'])
-translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["ENG_180220"],sp_model.english_bengali["BENG_180220"])
+translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["ENG_281220"],sp_model.english_bengali["BENG_281220"])
translation = sentence_processor.indic_detokenizer(translation)
elif i['id'] == 58:
"bengali-en 1st"
"bengali-en 2nd"
i['src'] = sentence_processor.indic_tokenizer(i['src'])
-translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["BENG_180220"],sp_model.english_bengali["ENG_180220"])
+translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["BENG_281220_2.2"],sp_model.english_bengali["ENG_281220_2.2"])
translation = sentence_processor.moses_detokenizer(translation)
elif i['id'] == 59:
"en-malay 2nd"
......
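Each branch of translate_func repeats the same encode-translate-decode pattern: a language-appropriate surface tokenizer, the paired SentencePiece models for the chosen direction, then the matching detokenizer. A sketch of that shared shape — the helper names mirror the ones used above, but the wrapper itself is illustrative, not project code:

def translate_with_pair(req, translation_server, src_sp, tgt_sp, tokenize, detokenize):
    req['src'] = tokenize(req['src'])  # e.g. moses_tokenizer for English sources
    translation, scores, input_sw, output_sw = encode_translate_decode(
        req, translation_server, src_sp, tgt_sp)  # subword encode -> NMT -> decode
    return detokenize(translation)     # e.g. indic_detokenizer for Bengali targets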
@@ -26,11 +26,11 @@ def onmt_train(f_in):
os.system('rm -f {0} {1} {2} {3}'.format(f_in['train_src'],f_in['train_tgt'],f_in['valid_src'],f_in['valid_tgt']))
logger.info("removed files, starting training for epoch:{}".format(f_in['epoch']))
-os.system('python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
+os.system('nohup python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
-encoder_type transformer -decoder_type transformer -position_encoding -train_steps {2} -max_generator_batches 2 -dropout 0.1 \
-batch_size 6000 -batch_type tokens -normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam \
-warmup_steps 8000 -learning_rate 0.25 -max_grad_norm 0 -param_init 0 -param_init_glorot -label_smoothing 0.1 -valid_steps 10000 \
--save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0'.format(f_in['nmt_processed_data'],f_in['nmt_model_path'],f_in['epoch']))
+-save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0 &'.format(f_in['nmt_processed_data'],f_in['nmt_model_path'],f_in['epoch']))
except Exception as e:
logger.error("error in onmt_train utils-anuvaad script: {}".format(e))
......
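Wrapping the command in nohup ... & detaches training from the calling shell, but it also makes os.system return immediately, so the epoch loop above no longer blocks until training finishes. A sketch of the same non-blocking launch via subprocess, which keeps a process handle — the flag list is abbreviated and the log path is an assumption:

import subprocess

def launch_training(f_in):
    cmd = ['python', 'train.py', '-data', f_in['nmt_processed_data'],
           '-save_model', f_in['nmt_model_path'],
           '-train_steps', str(f_in['epoch']),
           '-world_size', '1', '-gpu_ranks', '0']  # abbreviated flag list
    log = open('train.log', 'a')  # assumed log destination
    # Popen returns immediately, like `nohup ... &`, but the handle lets the
    # caller poll() or wait() instead of losing track of the training run.
    return subprocess.Popen(cmd, stdout=log, stderr=subprocess.STDOUT)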