Train¶

train.py

usage: train.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] -data DATA
                [-skip_empty_level {silent,warning,error}]
                [-transforms {insert_mask_before_placeholder,uppercase,inlinetags,bart,terminology,docify,inferfeats,filtertoolong,prefix,suffix,fuzzymatch,clean,switchout,tokendrop,tokenmask,sentencepiece,bpe,onmt_tokenize,normalize} [{insert_mask_before_placeholder,uppercase,inlinetags,bart,terminology,docify,inferfeats,filtertoolong,prefix,suffix,fuzzymatch,clean,switchout,tokendrop,tokenmask,sentencepiece,bpe,onmt_tokenize,normalize} ...]]
                [-save_data SAVE_DATA] [-overwrite] [-n_sample N_SAMPLE]
                [-dump_transforms] -src_vocab SRC_VOCAB [-tgt_vocab TGT_VOCAB]
                [-share_vocab] [--decoder_start_token DECODER_START_TOKEN]
                [--default_specials DEFAULT_SPECIALS [DEFAULT_SPECIALS ...]]
                [-n_src_feats N_SRC_FEATS]
                [-src_feats_defaults SRC_FEATS_DEFAULTS]
                [-src_vocab_size SRC_VOCAB_SIZE]
                [-tgt_vocab_size TGT_VOCAB_SIZE]
                [-vocab_size_multiple VOCAB_SIZE_MULTIPLE]
                [-src_words_min_frequency SRC_WORDS_MIN_FREQUENCY]
                [-tgt_words_min_frequency TGT_WORDS_MIN_FREQUENCY]
                [--src_seq_length_trunc SRC_SEQ_LENGTH_TRUNC]
                [--tgt_seq_length_trunc TGT_SEQ_LENGTH_TRUNC]
                [-both_embeddings BOTH_EMBEDDINGS]
                [-src_embeddings SRC_EMBEDDINGS]
                [-tgt_embeddings TGT_EMBEDDINGS]
                [-embeddings_type {GloVe,word2vec}]
                [--response_patterns RESPONSE_PATTERNS [RESPONSE_PATTERNS ...]]
                [--upper_corpus_ratio UPPER_CORPUS_RATIO]
                [--tags_dictionary_path TAGS_DICTIONARY_PATH]
                [--tags_corpus_ratio TAGS_CORPUS_RATIO] [--max_tags MAX_TAGS]
                [--paired_stag PAIRED_STAG] [--paired_etag PAIRED_ETAG]
                [--isolated_tag ISOLATED_TAG] [--src_delimiter SRC_DELIMITER]
                [--permute_sent_ratio PERMUTE_SENT_RATIO]
                [--rotate_ratio ROTATE_RATIO] [--insert_ratio INSERT_RATIO]
                [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
                [--mask_length {subword,word,span-poisson}]
                [--poisson_lambda POISSON_LAMBDA] [--replace_length {-1,0,1}]
                [--termbase_path TERMBASE_PATH]
                [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
                [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
                [--term_corpus_ratio TERM_CORPUS_RATIO]
                [--term_example_ratio TERM_EXAMPLE_RATIO]
                [--src_term_stoken SRC_TERM_STOKEN]
                [--tgt_term_stoken TGT_TERM_STOKEN]
                [--tgt_term_etoken TGT_TERM_ETOKEN]
                [--term_source_delimiter TERM_SOURCE_DELIMITER]
                [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
                [--reversible_tokenization {joiner,spacer}]
                [--src_seq_length SRC_SEQ_LENGTH]
                [--tgt_seq_length TGT_SEQ_LENGTH] [--src_prefix SRC_PREFIX]
                [--tgt_prefix TGT_PREFIX] [--src_suffix SRC_SUFFIX]
                [--tgt_suffix TGT_SUFFIX] [--tm_path TM_PATH]
                [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO]
                [--fuzzy_threshold FUZZY_THRESHOLD]
                [--tm_delimiter TM_DELIMITER] [--fuzzy_token FUZZY_TOKEN]
                [--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH]
                [--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH] [--src_eq_tgt]
                [--same_char] [--same_word] [--scripts_ok [SCRIPTS_OK ...]]
                [--scripts_nok [SCRIPTS_NOK ...]]
                [--src_tgt_ratio SRC_TGT_RATIO] [--avg_tok_min AVG_TOK_MIN]
                [--avg_tok_max AVG_TOK_MAX] [--langid [LANGID ...]]
                [-switchout_temperature SWITCHOUT_TEMPERATURE]
                [-tokendrop_temperature TOKENDROP_TEMPERATURE]
                [-tokenmask_temperature TOKENMASK_TEMPERATURE]
                [-src_subword_model SRC_SUBWORD_MODEL]
                [-tgt_subword_model TGT_SUBWORD_MODEL]
                [-src_subword_nbest SRC_SUBWORD_NBEST]
                [-tgt_subword_nbest TGT_SUBWORD_NBEST]
                [-src_subword_alpha SRC_SUBWORD_ALPHA]
                [-tgt_subword_alpha TGT_SUBWORD_ALPHA]
                [-src_subword_vocab SRC_SUBWORD_VOCAB]
                [-tgt_subword_vocab TGT_SUBWORD_VOCAB]
                [-src_vocab_threshold SRC_VOCAB_THRESHOLD]
                [-tgt_vocab_threshold TGT_VOCAB_THRESHOLD]
                [-src_subword_type {none,sentencepiece,bpe}]
                [-tgt_subword_type {none,sentencepiece,bpe}]
                [-src_onmttok_kwargs SRC_ONMTTOK_KWARGS]
                [-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok]
                [--src_lang SRC_LANG] [--tgt_lang TGT_LANG] [--penn PENN]
                [--norm_quote_commas NORM_QUOTE_COMMAS]
                [--norm_numbers NORM_NUMBERS]
                [--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT]
                [--post_remove_control_chars POST_REMOVE_CONTROL_CHARS]
                [--gpu_ranks [GPU_RANKS ...]] [--world_size WORLD_SIZE]
                [--parallel_mode {tensor_parallel,data_parallel}]
                [--gpu_backend GPU_BACKEND]
                [--gpu_verbose_level GPU_VERBOSE_LEVEL]
                [--master_ip MASTER_IP] [--master_port MASTER_PORT]
                [--timeout TIMEOUT] [--src_word_vec_size SRC_WORD_VEC_SIZE]
                [--tgt_word_vec_size TGT_WORD_VEC_SIZE]
                [--word_vec_size WORD_VEC_SIZE] [--share_decoder_embeddings]
                [--share_embeddings] [--position_encoding]
                [--position_encoding_type {SinusoidalInterleaved,SinusoidalConcat}]
                [-update_vocab] [--feat_merge {concat,sum,mlp}]
                [--feat_vec_size FEAT_VEC_SIZE]
                [--feat_vec_exponent FEAT_VEC_EXPONENT]
                [-model_task {seq2seq,lm}] [--model_type {text}]
                [--model_dtype {fp32,fp16}] [--encoder_type ENCODER_TYPE]
                [--decoder_type DECODER_TYPE] [--freeze_encoder]
                [--freeze_decoder] [--layers LAYERS] [--enc_layers ENC_LAYERS]
                [--dec_layers DEC_LAYERS] [--hidden_size HIDDEN_SIZE]
                [--enc_hid_size ENC_HID_SIZE] [--dec_hid_size DEC_HID_SIZE]
                [--cnn_kernel_width CNN_KERNEL_WIDTH]
                [--layer_norm {standard,rms}] [--norm_eps NORM_EPS]
                [--pos_ffn_activation_fn {relu,gelu,silu,gated-gelu}]
                [--input_feed INPUT_FEED] [--bridge]
                [--rnn_type {LSTM,GRU,SRU}]
                [--context_gate {source,target,both}]
                [--bridge_extra_node BRIDGE_EXTRA_NODE]
                [--bidir_edges BIDIR_EDGES] [--state_dim STATE_DIM]
                [--n_edge_types N_EDGE_TYPES] [--n_node N_NODE]
                [--n_steps N_STEPS] [--src_ggnn_size SRC_GGNN_SIZE]
                [--global_attention {dot,general,mlp,none}]
                [--global_attention_function {softmax,sparsemax}]
                [--self_attn_type SELF_ATTN_TYPE]
                [--max_relative_positions MAX_RELATIVE_POSITIONS]
                [--relative_positions_buckets RELATIVE_POSITIONS_BUCKETS]
                [--rotary_interleave] [--rotary_theta ROTARY_THETA]
                [--rotary_dim ROTARY_DIM] [--heads HEADS]
                [--sliding_window SLIDING_WINDOW]
                [--transformer_ff TRANSFORMER_FF] [--num_experts NUM_EXPERTS]
                [--num_experts_per_tok NUM_EXPERTS_PER_TOK] [--aan_useffn]
                [--add_qkvbias] [--multiquery] [--num_kv NUM_KV]
                [--add_ffnbias] [--parallel_residual] [--shared_layer_norm]
                [--lambda_align LAMBDA_ALIGN]
                [--alignment_layer ALIGNMENT_LAYER]
                [--alignment_heads ALIGNMENT_HEADS] [--full_context_alignment]
                [--copy_attn] [--copy_attn_type {dot,general,mlp,none}]
                [--generator_function {softmax,sparsemax}] [--copy_attn_force]
                [--reuse_copy_attn] [--copy_loss_by_seqlength]
                [--coverage_attn] [--lambda_coverage LAMBDA_COVERAGE]
                [--lm_prior_model LM_PRIOR_MODEL]
                [--lm_prior_lambda LM_PRIOR_LAMBDA]
                [--lm_prior_tau LM_PRIOR_TAU] [--loss_scale LOSS_SCALE]
                [--apex_opt_level {,O0,O1,O2,O3}] [--zero_out_prompt_loss]
                [--use_ckpting {ffn,mha,lora} [{ffn,mha,lora} ...]]
                [--data_type DATA_TYPE] [-bucket_size BUCKET_SIZE]
                [-bucket_size_init BUCKET_SIZE_INIT]
                [-bucket_size_increment BUCKET_SIZE_INCREMENT]
                [-prefetch_factor PREFETCH_FACTOR] [--save_model SAVE_MODEL]
                [--save_format {pytorch,safetensors}]
                [--save_checkpoint_steps SAVE_CHECKPOINT_STEPS]
                [--keep_checkpoint KEEP_CHECKPOINT]
                [--lora_layers LORA_LAYERS [LORA_LAYERS ...]]
                [--lora_embedding] [--lora_rank LORA_RANK]
                [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
                [--seed SEED] [--param_init PARAM_INIT] [--param_init_glorot]
                [--train_from TRAIN_FROM]
                [--reset_optim {none,all,states,keep_states}]
                [--pre_word_vecs_enc PRE_WORD_VECS_ENC]
                [--pre_word_vecs_dec PRE_WORD_VECS_DEC]
                [--freeze_word_vecs_enc] [--freeze_word_vecs_dec]
                [--num_workers NUM_WORKERS] [--batch_size BATCH_SIZE]
                [--batch_size_multiple BATCH_SIZE_MULTIPLE]
                [--batch_type {sents,tokens}] [--normalization {sents,tokens}]
                [--accum_count ACCUM_COUNT [ACCUM_COUNT ...]]
                [--accum_steps ACCUM_STEPS [ACCUM_STEPS ...]]
                [--valid_steps VALID_STEPS]
                [--valid_batch_size VALID_BATCH_SIZE]
                [--train_steps TRAIN_STEPS] [--single_pass]
                [--early_stopping EARLY_STOPPING]
                [--early_stopping_criteria [EARLY_STOPPING_CRITERIA ...]]
                [--optim {sgd,adagrad,adadelta,adam,sparseadam,adafactor,fusedadam,adamw8bit,pagedadamw8bit,pagedadamw32bit}]
                [--adagrad_accumulator_init ADAGRAD_ACCUMULATOR_INIT]
                [--max_grad_norm MAX_GRAD_NORM]
                [--dropout DROPOUT [DROPOUT ...]]
                [--attention_dropout ATTENTION_DROPOUT [ATTENTION_DROPOUT ...]]
                [--dropout_steps DROPOUT_STEPS [DROPOUT_STEPS ...]]
                [--truncated_decoder TRUNCATED_DECODER]
                [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2]
                [--label_smoothing LABEL_SMOOTHING]
                [--average_decay AVERAGE_DECAY]
                [--average_every AVERAGE_EVERY]
                [--learning_rate LEARNING_RATE]
                [--learning_rate_decay LEARNING_RATE_DECAY]
                [--start_decay_steps START_DECAY_STEPS]
                [--decay_steps DECAY_STEPS]
                [--decay_method {noam,noamwd,rsqrt,none}]
                [--warmup_steps WARMUP_STEPS] [--log_file LOG_FILE]
                [--log_file_level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET,50,40,30,20,10,0}]
                [--verbose]
                [--valid_metrics VALID_METRICS [VALID_METRICS ...]]
                [--scoring_debug] [--dump_preds DUMP_PREDS]
                [--report_every REPORT_EVERY] [--exp_host EXP_HOST]
                [--exp EXP] [--tensorboard]
                [--tensorboard_log_dir TENSORBOARD_LOG_DIR] [--override_opts]
                [--quant_layers QUANT_LAYERS [QUANT_LAYERS ...]]
                [--quant_type {,bnb_8bit,bnb_FP4,bnb_NF4,awq_gemm,awq_gemv}]
                [--w_bit {4}] [--group_size {128}]

Configuration¶

-config, --config: Path of the main YAML config file.
-save_config, --save_config: Path where to save the config.

Data¶

-data, --data

List of datasets and their specifications. See examples/*.yaml for further details.

-skip_empty_level, --skip_empty_level

Possible choices: silent, warning, error

Security level when encountering empty examples. silent: silently ignore/skip empty examples; warning: emit a warning when ignoring/skipping an empty example; error: raise an error and stop execution when encountering an empty example.

Default: “warning”

-transforms, --transforms

Possible choices: insert_mask_before_placeholder, uppercase, inlinetags, bart, terminology, docify, inferfeats, filtertoolong, prefix, suffix, fuzzymatch, clean, switchout, tokendrop, tokenmask, sentencepiece, bpe, onmt_tokenize, normalize

Default transform pipeline to apply to data. Can be specified in each corpus of data to override.

Default: []

-save_data, --save_data

Output base path for objects that will be saved (vocab, transforms, embeddings, …).

-overwrite, --overwrite

Overwrite existing objects if any.

Default: False

-n_sample, --n_sample

Stop after saving this number of transformed samples per corpus. Can be [-1, 0, N>0]. Set to -1 to go through the full corpus, 0 to skip.

Default: 0

-dump_transforms, --dump_transforms

Dump transforms *.transforms.pt to disk. -save_data should be set as saving prefix.

Default: False

Vocab¶

-src_vocab, --src_vocab

Path to src (or shared) vocabulary file. Format: one <word> or <word> <count> per line.

-tgt_vocab, --tgt_vocab

Path to tgt vocabulary file. Format: one <word> or <word> <count> per line.

-share_vocab, --share_vocab

Share source and target vocabulary.

Default: False

--decoder_start_token, -decoder_start_token

Default decoder start token. For most ONMT models it is <s> (BOS); some Fairseq models require </s> instead.

Default: “<s>”

--default_specials, -default_specials

Default specials used for vocab initialization. UNK, PAD, BOS, EOS will take IDs 0, 1, 2, 3; typically <unk> <blank> <s> </s>.

Default: [‘<unk>’, ‘<blank>’, ‘<s>’, ‘</s>’]

-src_vocab_size, --src_vocab_size

Maximum size of the source vocabulary.

Default: 32768

-tgt_vocab_size, --tgt_vocab_size

Maximum size of the target vocabulary

Default: 32768

-vocab_size_multiple, --vocab_size_multiple

Make the vocabulary size a multiple of this value.

Default: 8

-src_words_min_frequency, --src_words_min_frequency

Discard source words with lower frequency.

Default: 0

-tgt_words_min_frequency, --tgt_words_min_frequency

Discard target words with lower frequency.

Default: 0

Features¶

-n_src_feats, --n_src_feats

Number of source feats.

Default: 0

-src_feats_defaults, --src_feats_defaults

Default features to apply to the source in case they are not annotated.

Pruning¶

--src_seq_length_trunc, -src_seq_length_trunc: Truncate source sequence length.
--tgt_seq_length_trunc, -tgt_seq_length_trunc: Truncate target sequence length.

Embeddings¶

-both_embeddings, --both_embeddings

Path to the embeddings file to use for both source and target tokens.

-src_embeddings, --src_embeddings

Path to the embeddings file to use for source tokens.

-tgt_embeddings, --tgt_embeddings

Path to the embeddings file to use for target tokens.

-embeddings_type, --embeddings_type

Possible choices: GloVe, word2vec

Type of embeddings file.

Transform/InsertMaskBeforePlaceholdersTransform¶

--response_patterns, -response_patterns

Response pattern to locate the end of the prompt.

Default: [‘Response : ｟newline｠’]

Transform/Uppercase¶

--upper_corpus_ratio, -upper_corpus_ratio

Corpus ratio to apply uppercasing.

Default: 0.01

Transform/InlineTags¶

--tags_dictionary_path, -tags_dictionary_path

Path to a flat term dictionary.

--tags_corpus_ratio, -tags_corpus_ratio

Ratio of corpus to augment with tags.

Default: 0.1

--max_tags, -max_tags

Maximum number of tags that can be added to a single sentence.

Default: 12

--paired_stag, -paired_stag

The format of an opening paired inline tag. Must include the character #.

Default: “｟ph_#_beg｠”

--paired_etag, -paired_etag

The format of a closing paired inline tag. Must include the character #.

Default: “｟ph_#_end｠”

--isolated_tag, -isolated_tag

The format of an isolated inline tag. Must include the character #.

Default: “｟ph_#_std｠”

--src_delimiter, -src_delimiter

Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

Default: “｟fuzzy｠”

Transform/BART¶

--permute_sent_ratio, -permute_sent_ratio

Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

Default: 0.0

--rotate_ratio, -rotate_ratio

Rotate this proportion of inputs.

Default: 0.0

--insert_ratio, -insert_ratio

Insert this percentage of additional random tokens.

Default: 0.0

--random_ratio, -random_ratio

Instead of using <mask>, use random token this often.

Default: 0.0

--mask_ratio, -mask_ratio

Fraction of words/subwords that will be masked.

Default: 0.0

--mask_length, -mask_length

Possible choices: subword, word, span-poisson

Length of masking window to apply.

Default: “subword”

--poisson_lambda, -poisson_lambda

Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

Default: 3.0

--replace_length, -replace_length

Possible choices: -1, 0, 1

When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

Default: -1

Transform/Terminology¶

--termbase_path, -termbase_path

Path to a dictionary file with terms.

--src_spacy_language_model, -src_spacy_language_model

Name of the spacy language model for the source corpus.

--tgt_spacy_language_model, -tgt_spacy_language_model

Name of the spacy language model for the target corpus.

--term_corpus_ratio, -term_corpus_ratio

Ratio of corpus to augment with terms.

Default: 0.3

--term_example_ratio, -term_example_ratio

Max terms allowed in an example.

Default: 0.2

--src_term_stoken, -src_term_stoken

The source term start token.

Default: “｟src_term_start｠”

--tgt_term_stoken, -tgt_term_stoken

The target term start token.

Default: “｟tgt_term_start｠”

--tgt_term_etoken, -tgt_term_etoken

The target term end token.

Default: “｟tgt_term_end｠”

--term_source_delimiter, -term_source_delimiter

Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

Default: “｟fuzzy｠”

Transform/Docify¶

--doc_length, -doc_length

Number of tokens per doc.

Default: 200

--max_context, -max_context

Max context segments.

Default: 1

Transform/InferFeats¶

--reversible_tokenization, -reversible_tokenization

Possible choices: joiner, spacer

Type of reversible tokenization applied on the tokenizer.

Default: “joiner”

Transform/Filter¶

--src_seq_length, -src_seq_length

Maximum source sequence length.

Default: 192

--tgt_seq_length, -tgt_seq_length

Maximum target sequence length.

Default: 192

Transform/Prefix¶

--src_prefix, -src_prefix

String to prepend to all source example.

Default: “”

--tgt_prefix, -tgt_prefix

String to prepend to all target example.

Default: “”

Transform/Suffix¶

--src_suffix, -src_suffix

String to append to all source example.

Default: “”

--tgt_suffix, -tgt_suffix

String to append to all target example.

Default: “”

Transform/FuzzyMatching¶

--tm_path, -tm_path

Path to a flat text TM.

--fuzzy_corpus_ratio, -fuzzy_corpus_ratio

Ratio of corpus to augment with fuzzy matches.

Default: 0.1

--fuzzy_threshold, -fuzzy_threshold

The fuzzy matching threshold.

Default: 70

--tm_delimiter, -tm_delimiter

The delimiter used in the flat text TM.

Default: “ “

--fuzzy_token, -fuzzy_token

The fuzzy token to be added with the matches.

Default: “｟fuzzy｠”

--fuzzymatch_min_length, -fuzzymatch_min_length

Min length for TM entries and examples to match.

Default: 4

--fuzzymatch_max_length, -fuzzymatch_max_length

Max length for TM entries and examples to match.

Default: 70

Transform/Clean¶

--src_eq_tgt, -src_eq_tgt

Remove ex src==tgt

Default: False

--same_char, -same_char

Remove ex with same char more than 4 times

Default: False

--same_word, -same_word

Remove ex with same word more than 3 times

Default: False

--scripts_ok, -scripts_ok

List of unicodedata scripts accepted.

Default: [‘Latin’, ‘Common’]

--scripts_nok, -scripts_nok

List of unicodedata scripts not accepted.

Default: []

--src_tgt_ratio, -src_tgt_ratio

ratio between src and tgt

Default: 2

--avg_tok_min, -avg_tok_min

average length of tokens min

Default: 3

--avg_tok_max, -avg_tok_max

average length of tokens max

Default: 20

--langid, -langid

list of languages accepted

Default: []

Transform/SwitchOut¶

-switchout_temperature, --switchout_temperature

Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

Default: 1.0

Transform/Token_Drop¶

-tokendrop_temperature, --tokendrop_temperature

Sampling temperature for token deletion.

Default: 1.0

Transform/Token_Mask¶

-tokenmask_temperature, --tokenmask_temperature

Sampling temperature for token masking.

Default: 1.0

Transform/Subword/Common¶

Attention

Common options shared by all subword transforms, including options for indicating the subword model path, Subword Regularization/BPE-Dropout, and Vocabulary Restriction.

-src_subword_model, --src_subword_model

Path of subword model for src (or shared).

-tgt_subword_model, --tgt_subword_model

Path of subword model for tgt.

-src_subword_nbest, --src_subword_nbest

Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (source side)

Default: 1

-tgt_subword_nbest, --tgt_subword_nbest

Number of candidates in subword regularization. Valid for unigram sampling, invalid for BPE-dropout. (target side)

Default: 1

-src_subword_alpha, --src_subword_alpha

Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (source side)

Default: 0

-tgt_subword_alpha, --tgt_subword_alpha

Smoothing parameter for sentencepiece unigram sampling, and dropout probability for BPE-dropout. (target side)

Default: 0

-src_subword_vocab, --src_subword_vocab

Path to the vocabulary file for src subword. Format: <word> <count> per line.

Default: “”

-tgt_subword_vocab, --tgt_subword_vocab

Path to the vocabulary file for tgt subword. Format: <word> <count> per line.

Default: “”

-src_vocab_threshold, --src_vocab_threshold

Only produce src subword in src_subword_vocab with frequency >= src_vocab_threshold.

Default: 0

-tgt_vocab_threshold, --tgt_vocab_threshold

Only produce tgt subword in tgt_subword_vocab with frequency >= tgt_vocab_threshold.

Default: 0

Transform/Subword/ONMTTOK¶

-src_subword_type, --src_subword_type

Possible choices: none, sentencepiece, bpe

Type of subword model for src (or shared) in pyonmttok.

Default: “none”

-tgt_subword_type, --tgt_subword_type

Possible choices: none, sentencepiece, bpe

Type of subword model for tgt in pyonmttok.

Default: “none”

-src_onmttok_kwargs, --src_onmttok_kwargs

Other pyonmttok options for src in dict string, except subword related options listed earlier.

Default: “{‘mode’: ‘none’}”

-tgt_onmttok_kwargs, --tgt_onmttok_kwargs

Other pyonmttok options for tgt in dict string, except subword related options listed earlier.

Default: “{‘mode’: ‘none’}”

--gpt2_pretok, -gpt2_pretok

Preprocess sentence with byte-level mapping

Default: False

Transform/Normalize¶

--src_lang, -src_lang

Source language code

Default: “”

--tgt_lang, -tgt_lang

Target language code

Default: “”

--penn, -penn

Penn substitution

Default: True

--norm_quote_commas, -norm_quote_commas

Normalize quotations and commas

Default: True

--norm_numbers, -norm_numbers

Normalize numbers

Default: True

--pre_replace_unicode_punct, -pre_replace_unicode_punct

Replace unicode punct

Default: False

--post_remove_control_chars, -post_remove_control_chars

Remove control chars

Default: False

Distributed¶

--gpu_ranks, -gpu_ranks

list of ranks of each process.

Default: []

--world_size, -world_size

total number of distributed processes.

Default: 1

--parallel_mode, -parallel_mode

Possible choices: tensor_parallel, data_parallel

Distributed mode.

Default: “data_parallel”

--gpu_backend, -gpu_backend

Type of torch distributed backend

Default: “nccl”

--gpu_verbose_level, -gpu_verbose_level

Gives more info on each process per GPU.

Default: 0

--master_ip, -master_ip

IP of master for torch.distributed training.

Default: “localhost”

--master_port, -master_port

Port of master for torch.distributed training.

Default: 10000

--timeout, -timeout

Timeout for one GPU to wait for the others.

Default: 60

Model-Embeddings¶

--src_word_vec_size, -src_word_vec_size

Word embedding size for src.

Default: 500

--tgt_word_vec_size, -tgt_word_vec_size

Word embedding size for tgt.

Default: 500

--word_vec_size, -word_vec_size

Word embedding size for src and tgt.

Default: -1

--share_decoder_embeddings, -share_decoder_embeddings

Use a shared weight matrix for the input and output word embeddings in the decoder.

Default: False

--share_embeddings, -share_embeddings

Share the word embeddings between encoder and decoder. Need to use shared dictionary for this option.

Default: False

--position_encoding, -position_encoding

Use a sinusoid to mark relative word positions. Necessary for non-RNN style models.

Default: False

--position_encoding_type, -position_encoding_type

Possible choices: SinusoidalInterleaved, SinusoidalConcat

Type of positional encoding. At the moment: Sinusoidal fixed, Interleaved or Concat

Default: “SinusoidalInterleaved”

-update_vocab, --update_vocab

Update source and target existing vocabularies

Default: False

Model-Embedding Features¶

--feat_merge, -feat_merge

Possible choices: concat, sum, mlp

Merge action for incorporating features embeddings. Options [concat|sum|mlp].

Default: “concat”

--feat_vec_size, -feat_vec_size

If specified, feature embedding sizes will be set to this. Otherwise, feat_vec_exponent will be used.

Default: -1

--feat_vec_exponent, -feat_vec_exponent

If -feat_vec_size is not set, feature embedding sizes will be set to N^feat_vec_exponent where N is the number of values the feature takes.

Default: 0.7

Model- Task¶

-model_task, --model_task

Possible choices: seq2seq, lm

Type of task for the model either seq2seq or lm

Default: “seq2seq”

Model- Encoder-Decoder¶

--model_type, -model_type

Possible choices: text

Type of source model to use. Allows the system to incorporate non-text inputs. Options are [text].

Default: “text”

--model_dtype, -model_dtype

Possible choices: fp32, fp16

Data type of the model.

Default: “fp32”

--encoder_type, -encoder_type

Type of encoder layer to use. Non-RNN layers are experimental.

Default: “rnn”

--decoder_type, -decoder_type

Type of decoder layer to use. Non-RNN layers are experimental. Default options are [rnn|transformer|cnn|transformer].

Default: “rnn”

--freeze_encoder, -freeze_encoder

Freeze parameters in encoder.

Default: False

--freeze_decoder, -freeze_decoder

Freeze parameters in decoder.

Default: False

--layers, -layers

Number of layers in enc/dec.

Default: -1

--enc_layers, -enc_layers

Number of layers in the encoder

Default: 2

--dec_layers, -dec_layers

Number of layers in the decoder

Default: 2

--hidden_size, -hidden_size

Size of rnn hidden states. Overwrites enc_hid_size and dec_hid_size

Default: -1

--enc_hid_size, -enc_hid_size

Size of encoder rnn hidden states.

Default: 500

--dec_hid_size, -dec_hid_size

Size of decoder rnn hidden states.

Default: 500

--cnn_kernel_width, -cnn_kernel_width

Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in conv layer

Default: 3

--layer_norm, -layer_norm

Possible choices: standard, rms

The type of layer normalization in the transformer architecture. Choices are standard or rms. Default to standard

Default: “standard”

--norm_eps, -norm_eps

Layer norm epsilon

Default: 1e-06

--pos_ffn_activation_fn, -pos_ffn_activation_fn

Possible choices: relu, gelu, silu, gated-gelu

The activation function to use in PositionwiseFeedForward layer. Choices are relu, gelu, silu, gated-gelu. Defaults to relu.

Default: “relu”

--input_feed, -input_feed

Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.

Default: 1

--bridge, -bridge

Have an additional layer between the last encoder state and the first decoder state

Default: False

--rnn_type, -rnn_type

Possible choices: LSTM, GRU, SRU

The gate type to use in the RNNs

Default: “LSTM”

--context_gate, -context_gate

Possible choices: source, target, both

Type of context gate to use. Do not select for no context gate.

--bridge_extra_node, -bridge_extra_node

Graph encoder bridges only extra node to decoder as input

Default: True

--bidir_edges, -bidir_edges

Graph encoder autogenerates bidirectional edges

Default: True

--state_dim, -state_dim

Number of state dimensions in the graph encoder

Default: 512

--n_edge_types, -n_edge_types

Number of edge types in the graph encoder

Default: 2

--n_node, -n_node

Number of nodes in the graph encoder

Default: 2

--n_steps, -n_steps

Number of steps to advance graph encoder

Default: 2

--src_ggnn_size, -src_ggnn_size

Vocab size plus feature space for embedding input

Default: 0

Model- Attention¶

--global_attention, -global_attention

Possible choices: dot, general, mlp, none

The attention type to use: dotprod or general (Luong) or MLP (Bahdanau)

Default: “general”

--global_attention_function, -global_attention_function

Possible choices: softmax, sparsemax

Default: “softmax”

--self_attn_type, -self_attn_type

Self attention type in Transformer decoder layer – currently “scaled-dot”, “scaled-dot-flash” or “average”

Default: “scaled-dot-flash”

--max_relative_positions, -max_relative_positions

This setting enables relative position encoding. We support two types of encodings: set this to -1 to enable Rotary Embeddings (more info: https://arxiv.org/abs/2104.09864); set this to > 0 (ex: 16, 32) to use it as the maximum distance between inputs in relative positions representations (more info: https://arxiv.org/pdf/1803.02155.pdf).

Default: 0

--relative_positions_buckets, -relative_positions_buckets

This setting enables relative position bias. More info: https://github.com/google-research/text-to-text-transfer-transformer

Default: 0

--rotary_interleave, -rotary_interleave

Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. True = default Llama from Meta (original). False = used by all Hugging Face models.

Default: False

--rotary_theta, -rotary_theta

Rotary theta base length: 1e4 for Llama2 and Mistral, 1e6 for Mixtral.

Default: 10000

--rotary_dim, -rotary_dim

Rotary dim when model requires it to be different to head dim

Default: 0

--heads, -heads

Number of heads for transformer self-attention

Default: 8

--sliding_window, -sliding_window

sliding window for transformer self-attention

Default: 0

--transformer_ff, -transformer_ff

Size of hidden transformer feed-forward

Default: 2048

--num_experts, -num_experts

Number of experts

Default: 0

--num_experts_per_tok, -num_experts_per_tok

Number of experts per token

Default: 2

--aan_useffn, -aan_useffn

Turn on the FFN layer in the AAN decoder

Default: False

--add_qkvbias, -add_qkvbias

Add bias to nn.linear of Query/Key/Value in MHA. Note: this will add bias to the output projection layer too.

Default: False

--multiquery, -multiquery

Use MultiQuery attention. Note: https://arxiv.org/pdf/1911.02150.pdf

Default: False

--num_kv, -num_kv

Number of heads for KV in the variant of MultiQuery attention (egs: Falcon 40B)

Default: 0

--add_ffnbias, -add_ffnbias

Add bias to nn.linear of Position_wise FFN

Default: False

--parallel_residual, -parallel_residual

Use parallel residual in the decoder layer. Note: this is used by the GPT-J / Falcon architecture.

Default: False

--shared_layer_norm, -shared_layer_norm

Use a shared layer_norm in parallel residual attention. Note: must be true for Falcon 7B / false for Falcon 40B; same for GPT-J and GPT-NeoX models.

Default: False

Model - Alignment¶

--lambda_align, -lambda_align

Lambda value for alignment loss of Garg et al (2019). For more detailed information, see: https://arxiv.org/abs/1909.02074

Default: 0.0

--alignment_layer, -alignment_layer

Layer number which has to be supervised.

Default: -3

--alignment_heads, -alignment_heads

Number of cross attention heads per layer to supervise with.

Default: 0

--full_context_alignment, -full_context_alignment

Whether alignment is conditioned on full target context.

Default: False

Generator¶

--copy_attn, -copy_attn

Train copy attention layer.

Default: False

--copy_attn_type, -copy_attn_type

Possible choices: dot, general, mlp, none

The copy attention type to use. Leave as None to use the same as -global_attention.

--generator_function, -generator_function

Possible choices: softmax, sparsemax

Which function to use for generating probabilities over the target vocabulary (choices: softmax, sparsemax)

Default: “softmax”

--copy_attn_force, -copy_attn_force

When available, train to copy.

Default: False

--reuse_copy_attn, -reuse_copy_attn

Reuse standard attention for copy

Default: False

--copy_loss_by_seqlength, -copy_loss_by_seqlength

Divide copy loss by length of sequence

Default: False

--coverage_attn, -coverage_attn

Train a coverage attention layer.

Default: False

--lambda_coverage, -lambda_coverage

Lambda value for coverage loss of See et al (2017)

Default: 0.0

--lm_prior_model, -lm_prior_model

LM model used to train the TM.

--lm_prior_lambda, -lambda_prior_lambda

LM Prior Lambda

Default: 0.0

--lm_prior_tau, -lambda_prior_tau

LM Prior Tau

Default: 1.0

--loss_scale, -loss_scale

For FP16 training, the static loss scale to use. If not set, the loss scale is dynamically computed.

Default: 0

--apex_opt_level, -apex_opt_level

Possible choices: , O0, O1, O2, O3

For FP16 training, the opt_level to use. See https://nvidia.github.io/apex/amp.html#opt-levels.

Default: “”

--zero_out_prompt_loss, -zero_out_prompt_loss

Set the prompt loss to zero. Mostly for LLM finetuning. Will be enabled only if the insert_mask_before_placeholder transform is applied.

Default: False

--use_ckpting, -use_ckpting

Possible choices: ffn, mha, lora

Use gradient checkpointing for those modules

Default: []

General¶

--data_type, -data_type

Type of the source input. Options are [text].

Default: “text”

-bucket_size, --bucket_size

A bucket is a buffer of bucket_size examples to pick from the various Corpora. The dynamic iterator batches batch_size batches from the bucket and shuffles them.

Default: 262144

-bucket_size_init, --bucket_size_init

The bucket is initialized with this amount of examples (optional)

Default: -1

-bucket_size_increment, --bucket_size_increment

The bucket size is incremented with this amount of examples (optional)

Default: 0

-prefetch_factor, --prefetch_factor

Number of mini-batches loaded in advance to avoid the GPU waiting during the refilling of the bucket.

Default: 200

--save_model, -save_model

Model filename (the model will be saved as <save_model>_N.pt where N is the number of steps)

Default: “model”

--save_format, -save_format

Possible choices: pytorch, safetensors

Format to save the model weights

Default: “pytorch”

--save_checkpoint_steps, -save_checkpoint_steps

Save a checkpoint every X steps

Default: 5000

--keep_checkpoint, -keep_checkpoint

Keep X checkpoints (negative: keep all)

Default: -1

--lora_layers, -lora_layers

list of layers to be replaced by LoRa layers. ex: [‘linear_values’, ‘linear_query’] cf paper §4.2 https://arxiv.org/abs/2106.09685

Default: []

--lora_embedding, -lora_embedding

replace embeddings with LoRa Embeddings see §5.1

Default: False

--lora_rank, -lora_rank

r=2 successfully tested with NLLB-200 3.3B

Default: 2

--lora_alpha, -lora_alpha

§4.1 https://arxiv.org/abs/2106.09685

Default: 1

--lora_dropout, -lora_dropout

rule of thumb: same value as in main model

Default: 0.0

Reproducibility¶

--seed, -seed

Set random seed used for better reproducibility between experiments.

Default: -1

Initialization¶

--param_init, -param_init

Parameters are initialized over uniform distribution with support (-param_init, param_init). Use 0 to not use initialization

Default: 0.1

--param_init_glorot, -param_init_glorot

Init parameters with xavier_uniform. Required for transformer.

Default: False

--train_from, -train_from

If training from a checkpoint then this is the path to the pretrained model’s state_dict.

Default: “”

--reset_optim, -reset_optim

Possible choices: none, all, states, keep_states

Optimization resetter when train_from.

Default: “none”

--pre_word_vecs_enc, -pre_word_vecs_enc

If a valid path is specified, then this will load pretrained word embeddings on the encoder side. See README for specific formatting instructions.

--pre_word_vecs_dec, -pre_word_vecs_dec

If a valid path is specified, then this will load pretrained word embeddings on the decoder side. See README for specific formatting instructions.

--freeze_word_vecs_enc, -freeze_word_vecs_enc

Freeze word embeddings on the encoder side.

Default: False

--freeze_word_vecs_dec, -freeze_word_vecs_dec

Freeze word embeddings on the decoder side.

Default: False

Optimization- Type¶

--num_workers, -num_workers

pytorch DataLoader num_workers

Default: 2

--batch_size, -batch_size

Maximum batch size for training

Default: 64

--batch_size_multiple, -batch_size_multiple

Batch size multiple for token batches.

Default: 1

--batch_type, -batch_type

Possible choices: sents, tokens

Batch grouping for batch_size. Standard is sents. Tokens will do dynamic batching

Default: “sents”

--normalization, -normalization

Possible choices: sents, tokens

Normalization method of the gradient.

Default: “sents”

--accum_count, -accum_count

Accumulate gradient this many times. Approximately equivalent to updating batch_size * accum_count batches at once. Recommended for Transformer.

Default: [1]

--accum_steps, -accum_steps

Steps at which accum_count values change

Default: [0]

--valid_steps, -valid_steps

Perform validation every X steps

Default: 10000

--valid_batch_size, -valid_batch_size

Maximum batch size for validation

Default: 32

--train_steps, -train_steps

Number of training steps

Default: 100000

--single_pass, -single_pass

Make a single pass over the training dataset.

Default: False

--early_stopping, -early_stopping

Number of validation steps without improving.

Default: 0

--early_stopping_criteria, -early_stopping_criteria

Criteria to use for early stopping.

--optim, -optim

Possible choices: sgd, adagrad, adadelta, adam, sparseadam, adafactor, fusedadam, adamw8bit, pagedadamw8bit, pagedadamw32bit

Optimization method.

Default: “sgd”

--adagrad_accumulator_init, -adagrad_accumulator_init

Initializes the accumulator values in adagrad. Mirrors the initial_accumulator_value option in the tensorflow adagrad (use 0.1 for their default).

Default: 0

--max_grad_norm, -max_grad_norm

If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm

Default: 5

--dropout, -dropout

Dropout probability; applied in LSTM stacks.

Default: [0.3]

--attention_dropout, -attention_dropout

Attention Dropout probability.

Default: [0.1]

--dropout_steps, -dropout_steps

Steps at which dropout changes.

Default: [0]

--truncated_decoder, -truncated_decoder

Truncated bptt.

Default: 0

--adam_beta1, -adam_beta1

The beta1 parameter used by Adam. Almost without exception a value of 0.9 is used in the literature, seemingly giving good results, so we would discourage changing this value from the default without due consideration.

Default: 0.9

--adam_beta2, -adam_beta2

The beta2 parameter used by Adam. Typically a value of 0.999 is recommended, as this is the value suggested by the original paper describing Adam, and is also the value adopted in other frameworks such as Tensorflow and Keras, i.e. see: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer or https://keras.io/optimizers/ . Whereas recently the paper “Attention is All You Need” suggested a value of 0.98 for beta2, this parameter may not work well for normal models / default baselines.

Default: 0.999

--label_smoothing, -label_smoothing

Label smoothing value epsilon. Probabilities of all non-true labels will be smoothed by epsilon / (vocab_size - 1). Set to zero to turn off label smoothing. For more detailed information, see: https://arxiv.org/abs/1512.00567

Default: 0.0

--average_decay, -average_decay

Moving average decay. Set to other than 0 (e.g. 1e-4) to activate. Similar to Marian NMT implementation: http://www.aclweb.org/anthology/P18-4020 For more detail on Exponential Moving Average: https://en.wikipedia.org/wiki/Moving_average

Default: 0

--average_every, -average_every

Step for moving average. Default is every update, if -average_decay is set.

Default: 1

Optimization- Rate¶

--learning_rate, -learning_rate

Starting learning rate. Recommended settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.001

Default: 1.0

--learning_rate_decay, -learning_rate_decay

If update_learning_rate, decay learning rate by this much if steps have gone past start_decay_steps

Default: 0.5

--start_decay_steps, -start_decay_steps

Start decaying every decay_steps after start_decay_steps

Default: 50000

--decay_steps, -decay_steps

Decay every decay_steps

Default: 10000

--decay_method, -decay_method

Possible choices: noam, noamwd, rsqrt, none

Use a custom decay rate.

Default: “none”

--warmup_steps, -warmup_steps

Number of warmup steps for custom decay.

Default: 4000

Logging¶

--log_file, -log_file

Output logs to a file under this path.

Default: “”

--log_file_level, -log_file_level

Possible choices: CRITICAL, ERROR, WARNING, INFO, DEBUG, NOTSET, 50, 40, 30, 20, 10, 0

Default: “0”

--verbose, -verbose

Print data loading and statistics for all processes (default: only log the first process shard)

Default: False

--valid_metrics, -valid_metrics

List of names of additional validation metrics

Default: []

--scoring_debug, -scoring_debug

Dump the src/ref/pred of the current batch

Default: False

--dump_preds, -dump_preds

Folder to dump predictions to.

--report_every, -report_every

Print stats at this interval.

Default: 50

--exp_host, -exp_host

Send logs to this crayon server.

Default: “”

--exp, -exp

Name of the experiment for logging.

Default: “”

--tensorboard, -tensorboard

Use tensorboard for visualization during training. Must have the library tensorboard >= 1.14.

Default: False

--tensorboard_log_dir, -tensorboard_log_dir

Log directory for Tensorboard. This is also the name of the run.

Default: “runs/onmt”

--override_opts, -override-opts

Allow to override some checkpoint opts

Default: False

Quant options¶

--quant_layers, -quant_layers

list of layers to be compressed in 4/8bit.

Default: []

--quant_type, -quant_type

Possible choices: , bnb_8bit, bnb_FP4, bnb_NF4, awq_gemm, awq_gemv

Type of compression.

Default: “”

--w_bit, -w_bit

Possible choices: 4

W_bit quantization.

Default: 4

--group_size, -group_size

Possible choices: 128

group size quantization.

Default: 128