First convert your OpenNMT-py or OpenNMT-tf model to a CTranslate2 model.
January 18, 2023 · View on GitHub
pip3 install ctranslate2
• OpenNMT-py:
ct2-opennmt-py-converter --model_path model.pt --output_dir enja_ctranslate2 --quantization int8
• OpenNMT-tf:
ct2-opennmt-tf-converter --model_path model --output_dir enja_ctranslate2 --src_vocab source.vocab --tgt_vocab target.vocab --model_type TransformerBase --quantization int8
# Translate a plain-text file line by line with a CTranslate2 model, using
# SentencePiece models to subword the input and desubword the output.
import ctranslate2
import sentencepiece as spm

# Set file paths
source_file_path = "test.en"
target_file_path = "test.ja"

sp_source_model_path = "spm_model.en"
sp_target_model_path = "spm_model.ja"

ct_model_path = "enja_ctranslate2/"

# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_source_model_path)

# Open the source file
# Read explicitly as UTF-8 (the same encoding used for the output below) so
# decoding does not depend on the platform's default locale encoding.
with open(source_file_path, "r", encoding="utf-8") as source:
    # One sentence per line; surrounding whitespace and the newline are dropped.
    source_sents = [line.strip() for line in source]

# Subword the source sentences
source_sents_subworded = sp.encode_as_pieces(source_sents)

# Translate the source sentences
translator = ctranslate2.Translator(ct_model_path, device="cpu")  # or "cuda" for GPU
translations = translator.translate_batch(
    source_sents_subworded,
    batch_type="tokens",
    max_batch_size=4096,
)
# Keep only the first hypothesis returned for each input sentence.
translations = [translation.hypotheses[0] for translation in translations]

# Load the target SentencePiece model
# The same processor object is reused: the source model is no longer needed
# once the input has been subworded.
sp.load(sp_target_model_path)

# Desubword the target sentences
translations_desubword = sp.decode(translations)

# Save the translations to a file
# Plain "w" is enough: the file is only written, never read back.
with open(target_file_path, "w", encoding="utf-8") as target:
    for line in translations_desubword:
        target.write(line.strip() + "\n")

print("Done")