Skip to content
GitLab
Explore
Projects
Groups
Topics
Snippets
Projects
Groups
Topics
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Register
Sign in
Toggle navigation
Menu
anuvaad
OpenNMT-py
Commits
4b3685d8
Commit
4b3685d8
authored
4 years ago
by
aj7tesh
Browse files
Options
Download
Plain Diff
Merge branch 'experiments' into release_anuvaad
parents
08daef5e
a33fafcd
release_anuvaad
No related merge requests found
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
available_models/conf.json
+6
-4
available_models/conf.json
config/sentencepiece_model_loc.py
+4
-0
config/sentencepiece_model_loc.py
corpus/helper_functions/token_manipulator.py
+71
-2
corpus/helper_functions/token_manipulator.py
translation_util/translate_util.py
+4
-4
translation_util/translate_util.py
utils/training_utils/onmt_utils.py
+2
-2
utils/training_utils/onmt_utils.py
with
87 additions
and
12 deletions
+87
-12
available_models/conf.json
+
6
−
4
View file @
4b3685d8
...
...
@@ -375,26 +375,28 @@
},
{
"id"
:
57
,
"model"
:
"model_enBeng-en-to-b
eng-
2_2020-
0
2-
1
8-model_step_
20
0000.pt"
,
"model"
:
"model_enBeng-en-to-b
n-3.
2_2020-
1
2-
2
8-model_step_
19
0000.pt"
,
"timeout"
:
600
,
"on_timeout"
:
"to_cpu"
,
"load"
:
true
,
"opt"
:
{
"beam_size"
:
5
,
"log_file"
:
"log_file.txt"
,
"log_file_level"
:
"INFO"
"log_file_level"
:
"INFO"
,
"gpu"
:
0
}
},
{
"id"
:
58
,
"model"
:
"model_enBeng-b
eng
-to-en-
1
_2020-
0
2-
1
8-model_step_
20
0000.pt"
,
"model"
:
"model_enBeng-b
n
-to-en-
2.2
_2020-
1
2-
2
8-model_step_
18
0000.pt"
,
"timeout"
:
600
,
"on_timeout"
:
"to_cpu"
,
"load"
:
true
,
"opt"
:
{
"beam_size"
:
5
,
"log_file"
:
"log_file.txt"
,
"log_file_level"
:
"INFO"
"log_file_level"
:
"INFO"
,
"gpu"
:
0
}
},
{
...
...
This diff is collapsed.
Click to expand it.
config/sentencepiece_model_loc.py
+
4
−
0
View file @
4b3685d8
...
...
@@ -43,6 +43,10 @@ english_bengali = {
"BENG_120919"
:
"model/sentencepiece_models/beng-2019-09-12-10k.model"
,
"ENG_180220"
:
"model/sentencepiece_models/enBeng-en-to-beng-2-2020-02-18-24k.model"
,
"BENG_180220"
:
"model/sentencepiece_models/bengali-en-to-beng-2-2020-02-18-24k.model"
,
"ENG_281220"
:
"model/sentencepiece_models/enBeng-en-to-bn-3.2-2020-12-28-24k.model"
,
"BENG_281220"
:
"model/sentencepiece_models/bengali-en-to-bn-3.2-2020-12-28-24k.model"
,
"ENG_281220_2.2"
:
"model/sentencepiece_models/enBeng-bn-to-en-2.2-2020-12-28-24k.model"
,
"BENG_281220_2.2"
:
"model/sentencepiece_models/bengali-bn-to-en-2.2-2020-12-28-24k.model"
,
}
english_marathi
=
{
...
...
This diff is collapsed.
Click to expand it.
corpus/helper_functions/token_manipulator.py
+
71
−
2
View file @
4b3685d8
import
csv
def
token_finder
(
input_file
,
token
):
try
:
out_file
=
"file_with_{}"
.
format
(
token
)
...
...
@@ -63,7 +65,74 @@ def get_indices_for_same_lines_among_files(file_1,file_2):
print
(
e
)
index_array
=
token_finder
(
"corpus/master_corpus/english_hindi/eng_train_corpus_final.txt"
,
""
)
def token_list_finder(input_file, token, outfile, no_match_tokens, one_match_tokens):
    """Copy up to two corpus lines that contain ``token`` into ``outfile``.

    A line matches when it contains the token as given, lower-cased, or
    title-cased (plain substring test, not a whole-word match).

    Args:
        input_file: Path of the text corpus to scan.
        token: Token to search for.
        outfile: Open, writable text handle; matching lines are written to it
            unchanged. The handle is not closed here.
        no_match_tokens: List that ``token`` is appended to when no line matches.
        one_match_tokens: List that ``token`` is appended to when exactly one
            line matches.

    Returns:
        Always ``None``. Scanning stops as soon as a second match is written.
        Any exception is printed rather than propagated, so one bad token or
        an unreadable corpus does not abort the caller's loop.
    """
    try:
        # The three spellings do not depend on the line, so build them once.
        variants = (token, token.lower(), token.title())
        sent_count = 0
        with open(input_file) as corpus:
            for line in corpus:
                if any(variant in line for variant in variants):
                    outfile.write(line)
                    sent_count += 1
                    if sent_count == 2:
                        # Two examples per token are enough; stop early.
                        return None
        if sent_count == 0:
            no_match_tokens.append(token)
        elif sent_count == 1:
            one_match_tokens.append(token)
        return None
    except Exception as e:
        print("exception: ", e)
def create_tmx_data(
    tokens,
    corpus_file="corpus/original_data/english_bengali/final_bn_source.txt",
    out_file="file_with_token_list",
):
    """Collect example corpus lines for every token and print a summary.

    Each token is searched with ``token_list_finder``; matching lines are
    written to ``out_file``. Tokens with zero or exactly one matching line
    are gathered and printed at the end.

    Args:
        tokens: Iterable of tokens to look up.
        corpus_file: Corpus to search. Defaults to the path that was
            previously hard-coded.
        out_file: Output path, overwritten on every call. Defaults to the
            previously hard-coded ``file_with_token_list``.
    """
    one_match_tokens = []
    no_match_tokens = []
    # The context manager guarantees the output is flushed and closed; the
    # handle used to be left open for the lifetime of the process.
    with open(out_file, "w") as outfile:
        for token in tokens:
            token_list_finder(
                corpus_file, token, outfile, no_match_tokens, one_match_tokens
            )
    print("all done")
    print(len(no_match_tokens))
    print("final no match: ", no_match_tokens)
    print(len(one_match_tokens))
    print(one_match_tokens)
def csv_to_list(csv_path='corpus/tmx-english.csv'):
    """Return the first column of a CSV file as a list of strings.

    Args:
        csv_path: CSV file to read. Defaults to the path that was previously
            hard-coded.

    Returns:
        One entry per non-empty row, in file order. Blank rows are skipped;
        they used to raise ``IndexError`` on ``row[0]``.
    """
    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for files passed to csv.reader.
    with open(csv_path, 'r', newline='') as fd:
        token_list = [str(row[0]) for row in csv.reader(fd) if row]
    print(len(token_list))
    print("ddone")
    return token_list
# Module-level driver: loads the token list via csv_to_list() and passes it straight to create_tmx_data().
tokens
=
csv_to_list
()
create_tmx_data
(
tokens
)
def drop_duplicate(inFile, outFile):
    """Write ``inFile`` to ``outFile`` keeping only the first copy of each line.

    Line order is preserved. Lines are compared without their trailing
    newline, so a final line that lacks one is still recognised as a
    duplicate of an earlier identical line.

    Args:
        inFile: Path of the text file to read.
        outFile: Path of the file to write; overwritten if it exists.
    """
    lines_seen = set()  # line contents (newline stripped) already written
    # Both handles are closed by the context manager, including on error; the
    # input handle used to be left open.
    with open(inFile, "r") as source, open(outFile, "w") as target:
        for line in source:
            key = line.rstrip("\n")
            if key not in lines_seen:
                target.write(line)
                lines_seen.add(key)
# drop_duplicate("file_with_token_list","file_with_token_list_nodup-2")
# index_array = token_finder("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt","" )
# line_extracter_using_index("corpus/master_corpus/english_hindi/eng_train_corpus_final.txt",index_array,"test_en.txt")
line_extracter_using_index
(
"corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1"
,
index_array
,
"test.txt"
)
#
line_extracter_using_index("corpus/master_corpus/english_hindi/hindi_train_corpus_final.txt-1-1",index_array,"test.txt")
# lines_without_token("test3.txt","ADV","test4.txt")
This diff is collapsed.
Click to expand it.
translation_util/translate_util.py
+
4
−
4
View file @
4b3685d8
...
...
@@ -353,14 +353,14 @@ def translate_func(inputs, translation_server):
translation
,
scores
,
input_sw
,
output_sw
=
encode_translate_decode
(
i
,
translation_server
,
sp_model
.
english_punjabi
[
"PUNJABI_160220"
],
sp_model
.
english_punjabi
[
"ENG_160220"
])
translation
=
sentence_processor
.
moses_detokenizer
(
translation
)
elif
i
[
'id'
]
==
57
:
"en-bengali
2n
d"
"en-bengali
3r
d"
i
[
'src'
]
=
sentence_processor
.
moses_tokenizer
(
i
[
'src'
])
translation
,
scores
,
input_sw
,
output_sw
=
encode_translate_decode
(
i
,
translation_server
,
sp_model
.
english_bengali
[
"ENG_
180
220"
],
sp_model
.
english_bengali
[
"BENG_
180
220"
])
translation
,
scores
,
input_sw
,
output_sw
=
encode_translate_decode
(
i
,
translation_server
,
sp_model
.
english_bengali
[
"ENG_
281
220"
],
sp_model
.
english_bengali
[
"BENG_
281
220"
])
translation
=
sentence_processor
.
indic_detokenizer
(
translation
)
elif
i
[
'id'
]
==
58
:
"bengali-en
1st
"
"bengali-en
2nd
"
i
[
'src'
]
=
sentence_processor
.
indic_tokenizer
(
i
[
'src'
])
translation
,
scores
,
input_sw
,
output_sw
=
encode_translate_decode
(
i
,
translation_server
,
sp_model
.
english_bengali
[
"BENG_
180220
"
],
sp_model
.
english_bengali
[
"ENG_
180220
"
])
translation
,
scores
,
input_sw
,
output_sw
=
encode_translate_decode
(
i
,
translation_server
,
sp_model
.
english_bengali
[
"BENG_
281220_2.2
"
],
sp_model
.
english_bengali
[
"ENG_
281220_2.2
"
])
translation
=
sentence_processor
.
moses_detokenizer
(
translation
)
elif
i
[
'id'
]
==
59
:
"en-malay 2nd"
...
...
This diff is collapsed.
Click to expand it.
utils/training_utils/onmt_utils.py
+
2
−
2
View file @
4b3685d8
...
...
@@ -26,11 +26,11 @@ def onmt_train(f_in):
os
.
system
(
'rm -f {0} {1} {2} {3}'
.
format
(
f_in
[
'train_src'
],
f_in
[
'train_tgt'
],
f_in
[
'valid_src'
],
f_in
[
'valid_tgt'
]))
logger
.
info
(
"removed files, starting training for epoch:{}"
.
format
(
f_in
[
'epoch'
]))
os
.
system
(
'python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8
\
os
.
system
(
'
nohup
python train.py -data {0} -save_model {1} -layers 6 -rnn_size 512 -word_vec_size 512 -transformer_ff 2048 -heads 8
\
-encoder_type transformer -decoder_type transformer -position_encoding -train_steps {2} -max_generator_batches 2 -dropout 0.1
\
-batch_size 6000 -batch_type tokens -normalization tokens -accum_count 2 -optim adam -adam_beta2 0.998 -decay_method noam
\
-warmup_steps 8000 -learning_rate 0.25 -max_grad_norm 0 -param_init 0 -param_init_glorot -label_smoothing 0.1 -valid_steps 10000
\
-save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0'
.
format
(
f_in
[
'nmt_processed_data'
],
f_in
[
'nmt_model_path'
],
f_in
[
'epoch'
]))
-save_checkpoint_steps 10000 -world_size 1 -gpu_ranks 0
&
'
.
format
(
f_in
[
'nmt_processed_data'
],
f_in
[
'nmt_model_path'
],
f_in
[
'epoch'
]))
except
Exception
as
e
:
logger
.
error
(
"error in onmt_train utils-anuvaad script: {}"
.
format
(
e
))
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment
Menu
Explore
Projects
Groups
Topics
Snippets