Source code for opennmt.data.text

"""Text manipulation."""

import tensorflow as tf


[docs]def tokens_to_chars(tokens):
    """Splits tokens into unicode characters.

    Example:

      >>> opennmt.data.tokens_to_chars(["hello", "world"])
      <tf.RaggedTensor [[b'h', b'e', b'l', b'l', b'o'], [b'w', b'o', b'r', b'l', b'd']]>

    Args:
      tokens: A string ``tf.Tensor`` of shape :math:`[T]`.

    Returns:
      The characters as a 2D string ``tf.RaggedTensor``.
    """
    return tf.strings.unicode_split(tokens, "UTF-8")


[docs]def tokens_to_words(tokens, subword_token="￭", is_spacer=None):
    """Converts a sequence of tokens to a sequence of words.

    Example:

      >>> opennmt.data.tokens_to_words(["He@@", "llo", "W@@", "orld", "@@!"], subword_token="@@")
      <tf.RaggedTensor [[b'He@@', b'llo'], [b'W@@', b'orld', b'@@!']]>

    Args:
      tokens: A 1D string ``tf.Tensor``.
      subword_token: The special token used by the subword tokenizer.
      is_spacer: Whether :obj:`subword_token` is used as a spacer (as in
        SentencePiece) or a joiner (as in BPE). If ``None``, will infer
        directly from :obj:`subword_token`.

    Returns:
      The words as a 2D string ``tf.RaggedTensor``.
    """
    if is_spacer is None:
        is_spacer = subword_token == "▁"
    if is_spacer:
        # First token implicitly starts with a spacer.
        left_and_single = tf.logical_or(
            tf.strings.regex_full_match(tokens, "%s.*" % subword_token),
            tf.one_hot(0, tf.shape(tokens)[0], on_value=True, off_value=False),
        )
        right = tf.strings.regex_full_match(tokens, ".+%s" % subword_token)
        word_start = tf.logical_or(tf.roll(right, shift=1, axis=0), left_and_single)
    else:
        right = tf.strings.regex_full_match(tokens, ".*%s" % subword_token)
        left = tf.strings.regex_full_match(tokens, "%s.*" % subword_token)
        subword = tf.logical_or(tf.roll(right, shift=1, axis=0), left)
        word_start = tf.logical_not(subword)
    start_indices = tf.squeeze(tf.where(word_start), -1)
    return tf.RaggedTensor.from_row_starts(tokens, start_indices)


[docs]def alignment_matrix_from_pharaoh(
    alignment_line, source_length, target_length, dtype=tf.float32
):
    """Parse Pharaoh alignments into an alignment matrix.

    Example:

      >>> opennmt.data.alignment_matrix_from_pharaoh("0-0 1-2 1-3 2-1", 3, 4)
      <tf.Tensor: shape=(4, 3), dtype=float32, numpy=
      array([[1., 0., 0.],
             [0., 0., 1.],
             [0., 1., 0.],
             [0., 1., 0.]], dtype=float32)>

    Args:
      alignment_line: A string ``tf.Tensor`` in the Pharaoh format.
      source_length: The length of the source sentence, without special symbols.
      target_length: The length of the target sentence, without special symbols.
      dtype: The output matrix dtype. Defaults to ``tf.float32`` for convenience
        when computing the guided alignment loss.

    Returns:
      The alignment matrix as a 2-D ``tf.Tensor`` of type :obj:`dtype` and shape
      ``[target_length, source_length]``, where ``[i, j] = 1`` if the ``i`` th
      target word is aligned with the ``j`` th source word.
    """
    align_pairs_str = tf.strings.split([alignment_line]).values
    align_pairs_flat_str = tf.strings.split(align_pairs_str, sep="-").values
    align_pairs_flat = tf.strings.to_number(align_pairs_flat_str, out_type=tf.int64)
    sparse_indices = tf.reshape(align_pairs_flat, [-1, 2])
    sparse_values = tf.ones([tf.shape(sparse_indices)[0]], dtype=dtype)
    source_length = tf.cast(source_length, tf.int64)
    target_length = tf.cast(target_length, tf.int64)
    maximum_ids = tf.reduce_max(sparse_indices, axis=0)
    assert_source_length = _assert_in_range(
        maximum_ids[0], source_length, alignment_line, "source"
    )
    assert_target_length = _assert_in_range(
        maximum_ids[1], target_length, alignment_line, "target"
    )
    with tf.control_dependencies([assert_source_length, assert_target_length]):
        alignment_matrix_sparse = tf.sparse.SparseTensor(
            sparse_indices, sparse_values, [source_length, target_length]
        )
        alignment_matrix = tf.sparse.to_dense(
            alignment_matrix_sparse, validate_indices=False
        )
        return tf.transpose(alignment_matrix)


def _assert_in_range(maximum_id, length, line, name):
    return tf.debugging.assert_less(
        maximum_id,
        length,
        message=tf.strings.format(
            "Length mismatch for alignment line {}: actual %s length is {}, but "
            "got %s id {} which is out of range. Please check that the alignment "
            "file is correctly aligned to the training file." % (name, name),
            [line, length, maximum_id],
        ),
    )