usage: [-h] [-config CONFIG] [-save_config SAVE_CONFIG]
                [--src_word_vec_size SRC_WORD_VEC_SIZE]
                [--tgt_word_vec_size TGT_WORD_VEC_SIZE]
                [--word_vec_size WORD_VEC_SIZE] [--share_decoder_embeddings]
                [--share_embeddings] [--position_encoding]
                [--feat_merge {concat,sum,mlp}]
                [--feat_vec_size FEAT_VEC_SIZE]
                [--feat_vec_exponent FEAT_VEC_EXPONENT]
                [--model_type {text,img,audio,vec}]
                [--model_dtype {fp32,fp16}]
                [--encoder_type {rnn,brnn,ggnn,mean,transformer,cnn}]
                [--decoder_type {rnn,transformer,cnn}] [--layers LAYERS]
                [--enc_layers ENC_LAYERS] [--dec_layers DEC_LAYERS]
                [--rnn_size RNN_SIZE] [--enc_rnn_size ENC_RNN_SIZE]
                [--dec_rnn_size DEC_RNN_SIZE]
                [--audio_enc_pooling AUDIO_ENC_POOLING]
                [--cnn_kernel_width CNN_KERNEL_WIDTH]
                [--input_feed INPUT_FEED] [--bridge]
                [--rnn_type {LSTM,GRU,SRU}] [--brnn]
                [--context_gate {source,target,both}]
                [--bridge_extra_node BRIDGE_EXTRA_NODE]
                [--bidir_edges BIDIR_EDGES] [--state_dim STATE_DIM]
                [--n_edge_types N_EDGE_TYPES] [--n_node N_NODE]
                [--n_steps N_STEPS] [--src_vocab SRC_VOCAB]
                [--global_attention {dot,general,mlp,none}]
                [--global_attention_function {softmax,sparsemax}]
                [--self_attn_type SELF_ATTN_TYPE]
                [--max_relative_positions MAX_RELATIVE_POSITIONS]
                [--heads HEADS] [--transformer_ff TRANSFORMER_FF]
                [--aan_useffn] [--lambda_align LAMBDA_ALIGN]
                [--alignment_layer ALIGNMENT_LAYER]
                [--alignment_heads ALIGNMENT_HEADS] [--full_context_alignment]
                [--copy_attn] [--copy_attn_type {dot,general,mlp,none}]
                [--generator_function {softmax,sparsemax}] [--copy_attn_force]
                [--reuse_copy_attn] [--copy_loss_by_seqlength]
                [--coverage_attn] [--lambda_coverage LAMBDA_COVERAGE]
                [--loss_scale LOSS_SCALE] [--apex_opt_level {O0,O1,O2,O3}]
                --data DATA [--data_ids DATA_IDS [DATA_IDS ...]]
                [--data_weights DATA_WEIGHTS [DATA_WEIGHTS ...]]
                [--data_to_noise DATA_TO_NOISE [DATA_TO_NOISE ...]]
                [--save_model SAVE_MODEL]
                [--save_checkpoint_steps SAVE_CHECKPOINT_STEPS]
                [--keep_checkpoint KEEP_CHECKPOINT]
                [--gpuid [GPUID [GPUID ...]]]
                [--gpu_ranks [GPU_RANKS [GPU_RANKS ...]]]
                [--world_size WORLD_SIZE] [--gpu_backend GPU_BACKEND]
                [--gpu_verbose_level GPU_VERBOSE_LEVEL]
                [--master_ip MASTER_IP] [--master_port MASTER_PORT]
                [--queue_size QUEUE_SIZE] [--seed SEED]
                [--param_init PARAM_INIT] [--param_init_glorot]
                [--train_from TRAIN_FROM]
                [--reset_optim {none,all,states,keep_states}]
                [--pre_word_vecs_enc PRE_WORD_VECS_ENC]
                [--pre_word_vecs_dec PRE_WORD_VECS_DEC] [--fix_word_vecs_enc]
                [--fix_word_vecs_dec] [--batch_size BATCH_SIZE]
                [--batch_size_multiple BATCH_SIZE_MULTIPLE]
                [--batch_type {sents,tokens}] [--pool_factor POOL_FACTOR]
                [--normalization {sents,tokens}]
                [--accum_count ACCUM_COUNT [ACCUM_COUNT ...]]
                [--accum_steps ACCUM_STEPS [ACCUM_STEPS ...]]
                [--valid_steps VALID_STEPS]
                [--valid_batch_size VALID_BATCH_SIZE]
                [--max_generator_batches MAX_GENERATOR_BATCHES]
                [--train_steps TRAIN_STEPS] [--single_pass] [--epochs EPOCHS]
                [--early_stopping EARLY_STOPPING]
                [--early_stopping_criteria [EARLY_STOPPING_CRITERIA [EARLY_STOPPING_CRITERIA ...]]]
                [--optim {sgd,adagrad,adadelta,adam,sparseadam,adafactor,fusedadam}]
                [--adagrad_accumulator_init ADAGRAD_ACCUMULATOR_INIT]
                [--max_grad_norm MAX_GRAD_NORM]
                [--dropout DROPOUT [DROPOUT ...]]
                [--attention_dropout ATTENTION_DROPOUT [ATTENTION_DROPOUT ...]]
                [--dropout_steps DROPOUT_STEPS [DROPOUT_STEPS ...]]
                [--truncated_decoder TRUNCATED_DECODER]
                [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2]
                [--label_smoothing LABEL_SMOOTHING]
                [--average_decay AVERAGE_DECAY]
                [--average_every AVERAGE_EVERY]
                [--src_noise {sen_shuffling,infilling,mask} [{sen_shuffling,infilling,mask} ...]]
                [--src_noise_prob SRC_NOISE_PROB [SRC_NOISE_PROB ...]]
                [--learning_rate LEARNING_RATE]
                [--learning_rate_decay LEARNING_RATE_DECAY]
                [--start_decay_steps START_DECAY_STEPS]
                [--decay_steps DECAY_STEPS]
                [--decay_method {noam,noamwd,rsqrt,none}]
                [--warmup_steps WARMUP_STEPS] [--report_every REPORT_EVERY]
                [--log_file LOG_FILE]
                [--log_file_level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET,50,40,30,20,10,0}]
                [--exp_host EXP_HOST] [--exp EXP] [--tensorboard]
                [--tensorboard_log_dir TENSORBOARD_LOG_DIR]
                [--sample_rate SAMPLE_RATE] [--window_size WINDOW_SIZE]
                [--image_channel_size {3,1}]

Named Arguments

-config, --config

config file path

-save_config, --save_config

config file save path


--src_word_vec_size, -src_word_vec_size

Word embedding size for src.

Default: 500

--tgt_word_vec_size, -tgt_word_vec_size

Word embedding size for tgt.

Default: 500

--word_vec_size, -word_vec_size

Word embedding size for src and tgt.

Default: -1

--share_decoder_embeddings, -share_decoder_embeddings

Use a shared weight matrix for the input and output word embeddings in the decoder.

Default: False

--share_embeddings, -share_embeddings

Share the word embeddings between encoder and decoder. Need to use shared dictionary for this option.

Default: False

--position_encoding, -position_encoding

Use sinusoidal position encodings to mark the relative positions of words. Necessary for non-RNN style models.

Default: False

Model-Embedding Features

--feat_merge, -feat_merge

Possible choices: concat, sum, mlp

Merge action for incorporating features embeddings. Options [concat|sum|mlp].

Default: “concat”

--feat_vec_size, -feat_vec_size

If specified, feature embedding sizes will be set to this. Otherwise, feat_vec_exponent will be used.

Default: -1

--feat_vec_exponent, -feat_vec_exponent

If -feat_vec_size is not set, feature embedding sizes will be set to N^feat_vec_exponent where N is the number of values the feature takes.

Default: 0.7

Model- Encoder-Decoder

--model_type, -model_type

Possible choices: text, img, audio, vec

Type of source model to use. Allows the system to incorporate non-text inputs. Options are [text|img|audio|vec].

Default: “text”

--model_dtype, -model_dtype

Possible choices: fp32, fp16

Data type of the model.

Default: “fp32”

--encoder_type, -encoder_type

Possible choices: rnn, brnn, ggnn, mean, transformer, cnn

Type of encoder layer to use. Non-RNN layers are experimental. Options are [rnn|brnn|ggnn|mean|transformer|cnn].

Default: “rnn”

--decoder_type, -decoder_type

Possible choices: rnn, transformer, cnn

Type of decoder layer to use. Non-RNN layers are experimental. Options are [rnn|transformer|cnn].

Default: “rnn”

--layers, -layers

Number of layers in enc/dec.

Default: -1

--enc_layers, -enc_layers

Number of layers in the encoder

Default: 2

--dec_layers, -dec_layers

Number of layers in the decoder

Default: 2

--rnn_size, -rnn_size

Size of rnn hidden states. Overwrites enc_rnn_size and dec_rnn_size

Default: -1

--enc_rnn_size, -enc_rnn_size

Size of encoder rnn hidden states. Must be equal to dec_rnn_size except for speech-to-text.

Default: 500

--dec_rnn_size, -dec_rnn_size

Size of decoder rnn hidden states. Must be equal to enc_rnn_size except for speech-to-text.

Default: 500

--audio_enc_pooling, -audio_enc_pooling

The amount of pooling of audio encoder, either the same amount of pooling across all layers indicated by a single number, or different amounts of pooling per layer separated by comma.

Default: “1”

--cnn_kernel_width, -cnn_kernel_width

Size of windows in the cnn, the kernel_size is (cnn_kernel_width, 1) in conv layer

Default: 3

--input_feed, -input_feed

Feed the context vector at each time step as additional input (via concatenation with the word embeddings) to the decoder.

Default: 1

--bridge, -bridge

Have an additional layer between the last encoder state and the first decoder state

Default: False

--rnn_type, -rnn_type

Possible choices: LSTM, GRU, SRU

The gate type to use in the RNNs

Default: “LSTM”

--brnn, -brnn

Deprecated, use encoder_type.

--context_gate, -context_gate

Possible choices: source, target, both

Type of context gate to use. Do not select for no context gate.

--bridge_extra_node, -bridge_extra_node

Graph encoder bridges only extra node to decoder as input

Default: True

--bidir_edges, -bidir_edges

Graph encoder autogenerates bidirectional edges

Default: True

--state_dim, -state_dim

Number of state dimensions in the graph encoder

Default: 512

--n_edge_types, -n_edge_types

Number of edge types in the graph encoder

Default: 2

--n_node, -n_node

Number of nodes in the graph encoder

Default: 2

--n_steps, -n_steps

Number of steps to advance graph encoder

Default: 2

--src_vocab, -src_vocab

Path to an existing source vocabulary. Format: one word per line.

Default: “”

Model- Attention

--global_attention, -global_attention

Possible choices: dot, general, mlp, none

The attention type to use: dotprod or general (Luong) or MLP (Bahdanau)

Default: “general”

--global_attention_function, -global_attention_function

Possible choices: softmax, sparsemax

Default: “softmax”

--self_attn_type, -self_attn_type

Self attention type in Transformer decoder layer – currently “scaled-dot” or “average”

Default: “scaled-dot”

--max_relative_positions, -max_relative_positions

Maximum distance between inputs in relative position representations. For more detailed information, see Shaw et al. (2018), “Self-Attention with Relative Position Representations”.

Default: 0

--heads, -heads

Number of heads for transformer self-attention

Default: 8

--transformer_ff, -transformer_ff

Size of hidden transformer feed-forward

Default: 2048

--aan_useffn, -aan_useffn

Turn on the FFN layer in the AAN decoder

Default: False

Model - Alignment

--lambda_align, -lambda_align

Lambda value for the alignment loss of Garg et al. (2019). For more detailed information, see “Jointly Learning to Align and Translate with Transformer Models”.

Default: 0.0

--alignment_layer, -alignment_layer

Layer number which has to be supervised.

Default: -3

--alignment_heads, -alignment_heads

Number of cross-attention heads per layer to supervise.

Default: 0

--full_context_alignment, -full_context_alignment

Whether alignment is conditioned on full target context.

Default: False


--copy_attn, -copy_attn

Train copy attention layer.

Default: False

--copy_attn_type, -copy_attn_type

Possible choices: dot, general, mlp, none

The copy attention type to use. Leave as None to use the same as -global_attention.

--generator_function, -generator_function

Possible choices: softmax, sparsemax

Which function to use for generating probabilities over the target vocabulary (choices: softmax, sparsemax)

Default: “softmax”

--copy_attn_force, -copy_attn_force

When available, train to copy.

Default: False

--reuse_copy_attn, -reuse_copy_attn

Reuse standard attention for copy

Default: False

--copy_loss_by_seqlength, -copy_loss_by_seqlength

Divide copy loss by length of sequence

Default: False

--coverage_attn, -coverage_attn

Train a coverage attention layer.

Default: False

--lambda_coverage, -lambda_coverage

Lambda value for coverage loss of See et al (2017)

Default: 0.0

--loss_scale, -loss_scale

For FP16 training, the static loss scale to use. If not set, the loss scale is dynamically computed.

Default: 0

--apex_opt_level, -apex_opt_level

Possible choices: O0, O1, O2, O3

For FP16 training, the opt_level to use. See the NVIDIA Apex AMP documentation on opt_levels.

Default: “O1”


--data, -data

Path prefix to the “.train.pt” and “.valid.pt” files produced by preprocess.py.

--data_ids, -data_ids

In case there are several corpora.

Default: [None]

--data_weights, -data_weights

Weights of the different corpora; should follow the same order as in -data_ids.

Default: [1]

--data_to_noise, -data_to_noise

IDs of datasets on which to apply noise.

Default: []

--save_model, -save_model

Model filename (the model will be saved as <save_model>_N.pt, where N is the number of steps).

Default: “model”

--save_checkpoint_steps, -save_checkpoint_steps

Save a checkpoint every X steps

Default: 5000

--keep_checkpoint, -keep_checkpoint

Keep X checkpoints (negative: keep all)

Default: -1

--gpuid, -gpuid

Deprecated; see world_size and gpu_ranks.

Default: []

--gpu_ranks, -gpu_ranks

list of ranks of each process.

Default: []

--world_size, -world_size

total number of distributed processes.

Default: 1

--gpu_backend, -gpu_backend

Type of torch distributed backend

Default: “nccl”

--gpu_verbose_level, -gpu_verbose_level

Gives more info on each process per GPU.

Default: 0

--master_ip, -master_ip

IP of master for torch.distributed training.

Default: “localhost”

--master_port, -master_port

Port of master for torch.distributed training.

Default: 10000

--queue_size, -queue_size

Size of queue for each process in producer/consumer

Default: 40

--seed, -seed

Random seed used to make experiments reproducible.

Default: -1


--param_init, -param_init

Parameters are initialized over uniform distribution with support (-param_init, param_init). Use 0 to not use initialization

Default: 0.1

--param_init_glorot, -param_init_glorot

Init parameters with xavier_uniform. Required for transformer.

Default: False

--train_from, -train_from

If training from a checkpoint then this is the path to the pretrained model’s state_dict.

Default: “”

--reset_optim, -reset_optim

Possible choices: none, all, states, keep_states

Optimization resetter when train_from.

Default: “none”

--pre_word_vecs_enc, -pre_word_vecs_enc

If a valid path is specified, then this will load pretrained word embeddings on the encoder side. See README for specific formatting instructions.

--pre_word_vecs_dec, -pre_word_vecs_dec

If a valid path is specified, then this will load pretrained word embeddings on the decoder side. See README for specific formatting instructions.

--fix_word_vecs_enc, -fix_word_vecs_enc

Fix word embeddings on the encoder side.

Default: False

--fix_word_vecs_dec, -fix_word_vecs_dec

Fix word embeddings on the decoder side.

Default: False

Optimization- Type

--batch_size, -batch_size

Maximum batch size for training

Default: 64

--batch_size_multiple, -batch_size_multiple

Batch size multiple for token batches.

--batch_type, -batch_type

Possible choices: sents, tokens

Batch grouping for batch_size. Standard is sents. Tokens will do dynamic batching

Default: “sents”

--pool_factor, -pool_factor

Factor used in data loading and batch creation. It will load the equivalent of pool_factor batches, sort them by the corresponding sort_key to produce homogeneous batches and reduce padding, and yield the produced batches in a shuffled way. Inspired by torchtext’s pool mechanism.

Default: 8192

--normalization, -normalization

Possible choices: sents, tokens

Normalization method of the gradient.

Default: “sents”

--accum_count, -accum_count

Accumulate gradient this many times. Approximately equivalent to updating batch_size * accum_count batches at once. Recommended for Transformer.

Default: [1]

--accum_steps, -accum_steps

Steps at which accum_count values change

Default: [0]

--valid_steps, -valid_steps

Perform validation every X steps

Default: 10000

--valid_batch_size, -valid_batch_size

Maximum batch size for validation

Default: 32

--max_generator_batches, -max_generator_batches

Maximum batches of words in a sequence to run the generator on in parallel. Higher is faster, but uses more memory. Set to 0 to disable.

Default: 32

--train_steps, -train_steps

Number of training steps

Default: 100000

--single_pass, -single_pass

Make a single pass over the training dataset.

Default: False

--epochs, -epochs

Deprecated; see train_steps.

Default: 0

--early_stopping, -early_stopping

Number of validation steps without improving.

Default: 0

--early_stopping_criteria, -early_stopping_criteria

Criteria to use for early stopping.

--optim, -optim

Possible choices: sgd, adagrad, adadelta, adam, sparseadam, adafactor, fusedadam

Optimization method.

Default: “sgd”

--adagrad_accumulator_init, -adagrad_accumulator_init

Initializes the accumulator values in adagrad. Mirrors the initial_accumulator_value option in the tensorflow adagrad (use 0.1 for their default).

Default: 0

--max_grad_norm, -max_grad_norm

If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm

Default: 5

--dropout, -dropout

Dropout probability; applied in LSTM stacks.

Default: [0.3]

--attention_dropout, -attention_dropout

Attention Dropout probability.

Default: [0.1]

--dropout_steps, -dropout_steps

Steps at which dropout changes.

Default: [0]

--truncated_decoder, -truncated_decoder

Truncated bptt.

Default: 0

--adam_beta1, -adam_beta1

The beta1 parameter used by Adam. Almost without exception a value of 0.9 is used in the literature, seemingly giving good results, so we would discourage changing this value from the default without due consideration.

Default: 0.9

--adam_beta2, -adam_beta2

The beta2 parameter used by Adam. Typically a value of 0.999 is recommended, as this is the value suggested by the original paper describing Adam, and is also the value adopted in other frameworks such as Tensorflow and Keras (see their Adam optimizer documentation). Whereas recently the paper “Attention is All You Need” suggested a value of 0.98 for beta2, this parameter may not work well for normal models / default baselines.

Default: 0.999

--label_smoothing, -label_smoothing

Label smoothing value epsilon. Probabilities of all non-true labels will be smoothed by epsilon / (vocab_size - 1). Set to zero to turn off label smoothing. For more detailed information, see Szegedy et al., “Rethinking the Inception Architecture for Computer Vision” (arXiv:1512.00567).

Default: 0.0

--average_decay, -average_decay

Moving average decay. Set to other than 0 (e.g. 1e-4) to activate. Similar to the Marian NMT implementation (Junczys-Dowmunt et al., 2018). For more detail on the exponential moving average, see the Wikipedia article “Moving average”.

Default: 0

--average_every, -average_every

Step for moving average. Default is every update, if -average_decay is set.

Default: 1

--src_noise, -src_noise

Possible choices: sen_shuffling, infilling, mask

Default: []

--src_noise_prob, -src_noise_prob

Probabilities of src_noise functions

Default: []

Optimization- Rate

--learning_rate, -learning_rate

Starting learning rate. Recommended settings: sgd = 1, adagrad = 0.1, adadelta = 1, adam = 0.001

Default: 1.0

--learning_rate_decay, -learning_rate_decay

If update_learning_rate, decay learning rate by this much if steps have gone past start_decay_steps

Default: 0.5

--start_decay_steps, -start_decay_steps

Start decaying every decay_steps after start_decay_steps

Default: 50000

--decay_steps, -decay_steps

Decay every decay_steps

Default: 10000

--decay_method, -decay_method

Possible choices: noam, noamwd, rsqrt, none

Use a custom decay rate.

Default: “none”

--warmup_steps, -warmup_steps

Number of warmup steps for custom decay.

Default: 4000


--report_every, -report_every

Print stats at this interval.

Default: 50

--log_file, -log_file

Output logs to a file under this path.

Default: “”

--log_file_level, -log_file_level

Possible choices: CRITICAL, ERROR, WARNING, INFO, DEBUG, NOTSET, 50, 40, 30, 20, 10, 0

Default: “0”

--exp_host, -exp_host

Send logs to this crayon server.

Default: “”

--exp, -exp

Name of the experiment for logging.

Default: “”

--tensorboard, -tensorboard

Use tensorboard for visualization during training. Must have the library tensorboard >= 1.14.

Default: False

--tensorboard_log_dir, -tensorboard_log_dir

Log directory for Tensorboard. This is also the name of the run.

Default: “runs/onmt”


--sample_rate, -sample_rate

Sample rate.

Default: 16000

--window_size, -window_size

Window size for spectrogram in seconds.

Default: 0.02

--image_channel_size, -image_channel_size

Possible choices: 3, 1

Using grayscale images can make the model smaller and faster to train.

Default: 3