Merge branch 'experiments' into release_anuvaad

4b3685d8 · aj7tesh · 08daef5e · a33fafcd · 4b3685d8 · 4b3685d8
Commit 4b3685d8 authored 4 years ago by aj7tesh
Hide whitespace changes
Inline Side-by-side

Showing

with 87 additions and 12 deletions
+87 -12
--- a/available_models/conf.json
+++ b/available_models/conf.json
@@ -375,26 +375,28 @@
        },
        {
            "id": 57,
-            "model": "model_enBeng-en-to-beng-2_2020-02-18-model_step_200000.pt",
+            "model": "model_enBeng-en-to-bn-3.2_2020-12-28-model_step_190000.pt",
            "timeout": 600,
            "on_timeout": "to_cpu",
            "load": true,
            "opt": {
                "beam_size": 5,
                "log_file":"log_file.txt",
-                "log_file_level":"INFO"
+                "log_file_level":"INFO",
+                "gpu":0
            }
        },
        {
            "id": 58,
-            "model": "model_enBeng-beng-to-en-1_2020-02-18-model_step_200000.pt",
+            "model": "model_enBeng-bn-to-en-2.2_2020-12-28-model_step_180000.pt",
            "timeout": 600,
            "on_timeout": "to_cpu",
            "load": true,
            "opt": {
                "beam_size": 5,
                "log_file":"log_file.txt",
-                "log_file_level":"INFO"
+                "log_file_level":"INFO",
+                "gpu":0
            }
        },
        {

--- a/config/sentencepiece_model_loc.py
+++ b/config/sentencepiece_model_loc.py
@@ -43,6 +43,10 @@ english_bengali = {
    "BENG_120919": "model/sentencepiece_models/beng-2019-09-12-10k.model",
    "ENG_180220": "model/sentencepiece_models/enBeng-en-to-beng-2-2020-02-18-24k.model",
    "BENG_180220": "model/sentencepiece_models/bengali-en-to-beng-2-2020-02-18-24k.model",
+    "ENG_281220": "model/sentencepiece_models/enBeng-en-to-bn-3.2-2020-12-28-24k.model",
+    "BENG_281220": "model/sentencepiece_models/bengali-en-to-bn-3.2-2020-12-28-24k.model",
+    "ENG_281220_2.2": "model/sentencepiece_models/enBeng-bn-to-en-2.2-2020-12-28-24k.model",
+    "BENG_281220_2.2": "model/sentencepiece_models/bengali-bn-to-en-2.2-2020-12-28-24k.model",
 }

 english_marathi = {

--- a/corpus/helper_functions/token_manipulator.py
+++ b/corpus/helper_functions/token_manipulator.py
+import csv
+
 def token_finder(input_file,token):
    try:
        out_file = "file_with_{}".format(token)
@@ -63,7 +65,74 @@ def get_indices_for_same_lines_among_files(file_1,file_2):
        print(e)
    

-index_array = token_finder("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt","" )
+def token_list_finder(input_file,token,outfile,no_match_tokens,one_match_tokens):
+    """Copy up to two lines of ``input_file`` that mention ``token`` into ``outfile``.
+
+    A line counts as a match when it contains the token exactly as given,
+    lower-cased, or title-cased. Scanning stops as soon as a second matching
+    line has been written. When the whole file is read with fewer than two
+    matches, the token is appended to ``no_match_tokens`` (zero matches) or
+    to ``one_match_tokens`` (exactly one match).
+
+    Args:
+        input_file: path of the text file to scan.
+        token: string to look for.
+        outfile: writable file-like object that receives the matching lines.
+        no_match_tokens: list extended in place with tokens found nowhere.
+        one_match_tokens: list extended in place with tokens found only once.
+
+    Returns:
+        Always None. An ``Exception`` raised while scanning is printed,
+        not propagated.
+    """
+    try:
+        hits = 0
+        with open(input_file) as corpus:
+            for text in corpus:
+                spellings = (token, token.lower(), token.title())
+                if not any(spelling in text for spelling in spellings):
+                    continue
+                outfile.write(text)
+                hits += 1
+                if hits == 2:
+                    # Two example lines per token are enough; stop reading.
+                    return
+            # Reaching this point means the file was read to the end.
+            if hits == 0:
+                no_match_tokens.append(token)
+            if hits == 1:
+                one_match_tokens.append(token)
+        return None
+    except Exception as e:
+        print("exception: ",e)
+        
+def create_tmx_data(tokens):
+    """Collect example corpus lines for every token and report the gaps.
+
+    Each token is searched for in
+    ``corpus/original_data/english_bengali/final_bn_source.txt`` through
+    ``token_list_finder``; matching lines are written to the file
+    ``file_with_token_list`` in the current working directory. Afterwards
+    the tokens with no match and with a single match are printed together
+    with their counts.
+
+    Args:
+        tokens: iterable of token strings to look up.
+
+    Returns:
+        None.
+    """
+    one_match_tokens = []
+    no_match_tokens = []
+    # The with-block closes (and therefore flushes) the output file even if
+    # a lookup fails; previously the handle was never closed.
+    with open("file_with_token_list", "w") as outfile:
+        for token in tokens:
+            token_list_finder("corpus/original_data/english_bengali/final_bn_source.txt",token,outfile,no_match_tokens,one_match_tokens)
+
+    print("all done")
+    print(len(no_match_tokens))
+    print("final no match: ",no_match_tokens)
+    print(len(one_match_tokens))
+    print(one_match_tokens)
+
+def csv_to_list():
+    """Return the first column of ``corpus/tmx-english.csv`` as a list of strings.
+
+    Blank lines in the CSV are skipped: ``csv.reader`` yields an empty row
+    for them, which has no first column to read.
+
+    Returns:
+        list of str, one entry per non-empty CSV row, in file order.
+    """
+    token_list = []
+    # newline="" lets the csv module do its own newline handling, as the
+    # csv documentation recommends for files passed to csv.reader.
+    with open('corpus/tmx-english.csv', 'r', newline='') as fd:
+        for row in csv.reader(fd):
+            if not row:
+                continue
+            token_list.append(str(row[0]))
+    print(len(token_list))
+    print("ddone")
+    return token_list
+        
+# Script driver, executed whenever this module is run or imported: load the
+# token list from the CSV, then search the corpus for each token.
+tokens = csv_to_list()
+
+create_tmx_data(tokens)
+
+def drop_duplicate(inFile,outFile):
+    """Write ``inFile`` to ``outFile`` keeping only the first copy of each line.
+
+    Lines are compared exactly as read, including their line ending, and
+    the order of first occurrences is preserved.
+
+    Args:
+        inFile: path of the text file to read.
+        outFile: path of the text file to create or overwrite.
+
+    Returns:
+        None.
+    """
+    lines_seen = set() # holds lines already seen
+    # Both handles are managed by the with-block so they are closed even if
+    # reading or writing fails part-way through.
+    with open("{0}".format(inFile), "r") as infile, open("{0}".format(outFile), "w") as outfile:
+        for line in infile:
+            if line not in lines_seen: # not a duplicate
+                outfile.write(line)
+                lines_seen.add(line)
+    
+# drop_duplicate("file_with_token_list","file_with_token_list_nodup-2")
+
+# index_array = token_finder("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt","" )
 # line_extracter_using_index("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt",index_array,"test_en.txt")
-line_extracter_using_index("corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1",index_array,"test.txt")
+# line_extracter_using_index("corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1",index_array,"test.txt")
 # lines_without_token("test3.txt","ADV","test4.txt")
--- a/translation_util/translate_util.py
+++ b/translation_util/translate_util.py
@@ -353,14 +353,14 @@ def translate_func(inputs, translation_server):
                    translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_punjabi["PUNJABI_160220"],sp_model.english_punjabi["ENG_160220"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 57:
-                    "en-bengali 2nd"
+                    "en-bengali 3rd"
                    i['src'] = sentence_processor.moses_tokenizer(i['src'])
-                    translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["ENG_180220"],sp_model.english_bengali["BENG_180220"])
+                    translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["ENG_281220"],sp_model.english_bengali["BENG_281220"])
                    translation = sentence_processor.indic_detokenizer(translation) 
                elif i['id'] == 58:
-                    "bengali-en 1st"
+                    "bengali-en 2nd"
                    i['src'] = sentence_processor.indic_tokenizer(i['src'])
-                    translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["BENG_180220"],sp_model.english_bengali["ENG_180220"])
+                    translation,scores,input_sw,output_sw = encode_translate_decode(i,translation_server,sp_model.english_bengali["BENG_281220_2.2"],sp_model.english_bengali["ENG_281220_2.2"])
                    translation = sentence_processor.moses_detokenizer(translation)
                elif i['id'] == 59:
                    "en-malay 2nd"

--- a/utils/training_utils/onmt_utils.py
+++ b/utils/training_utils/onmt_utils.py
@@ -26,11 +26,11 @@ def onmt_train(f_in):
        os.system('rm -f {0} {1} {2} {3}'.format(f_in['train_src'],f_in['train_tgt'],f_in['valid_src'],f_in['valid_tgt']))
        logger.info("removed files, starting training for epoch:{}".format(f_in['epoch']))
        
-        os.system('python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
+        os.system('nohup python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8 \
                  -encoder_type transformer -decoder_type transformer -position_encoding -train_steps {2} -max_generator_batches 2 -dropout 0.1  \
                  -batch_size 6000 -batch_type tokens -normalization tokens  -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam \
                  -warmup_steps 8000 -learning_rate 0.25 -max_grad_norm 0 -param_init 0  -param_init_glorot  -label_smoothing 0.1 -valid_steps 10000 \
-                  -save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0'.format(f_in['nmt_processed_data'],f_in['nmt_model_path'],f_in['epoch']))
+                  -save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0 &'.format(f_in['nmt_processed_data'],f_in['nmt_model_path'],f_in['epoch']))

    except Exception as e:
        logger.error("error in onmt_train utils-anuvaad script: {}".format(e))