Source code for opennmt.inputters.record_inputter
"""Define inputters reading from TFRecord files."""
import numpy as np
import tensorflow as tf
from opennmt.data import dataset as dataset_util
from opennmt.inputters.inputter import Inputter
class SequenceRecordInputter(Inputter):
    """Inputter consuming serialized ``tf.train.SequenceExample`` records.

    Each record is parsed for a ``"values"`` feature list made of float
    vectors of size ``input_depth``.

    See Also:
        :func:`opennmt.inputters.create_sequence_records` to generate a
        compatible dataset.
    """

    def __init__(self, input_depth, **kwargs):
        """Initializes the parameters of the record inputter.

        Args:
            input_depth: The depth dimension of the input vectors.
            **kwargs: Additional layer keyword arguments.
        """
        super().__init__(**kwargs)
        self.input_depth = input_depth

    def make_dataset(self, data_file, training=None):
        """Builds the dataset reading TFRecord data from ``data_file``.

        Delegates to ``dataset_util.make_datasets`` with
        ``tf.data.TFRecordDataset`` as the dataset type. ``training`` is
        accepted but not used here.
        """
        return dataset_util.make_datasets(tf.data.TFRecordDataset, data_file)

    def input_signature(self):
        """Returns the ``tf.TensorSpec`` of the ``tensor`` and ``length`` features."""
        tensor_spec = tf.TensorSpec([None, None, self.input_depth], self.dtype)
        length_spec = tf.TensorSpec([None], tf.int32)
        return {"tensor": tensor_spec, "length": length_spec}

    def make_features(self, element=None, features=None, training=None):
        """Parses serialized sequence examples into the features dictionary.

        Args:
            element: The serialized ``tf.train.SequenceExample`` data to parse.
            features: An optional dictionary to fill in place. When it already
                holds a ``"tensor"`` entry it is returned untouched.
            training: Accepted but not used here.

        Returns:
            The features dictionary holding ``"length"`` (cast to ``tf.int32``)
            and ``"tensor"`` (cast to the inputter ``dtype``).
        """
        if features is None:
            features = {}
        if "tensor" not in features:
            sequence_spec = {
                "values": tf.io.FixedLenSequenceFeature(
                    [self.input_depth], dtype=tf.float32
                )
            }
            # The context features (first output) are not used.
            _context, sequences, sequence_lengths = tf.io.parse_sequence_example(
                element, sequence_features=sequence_spec
            )
            values = sequences["values"]
            features["length"] = tf.cast(sequence_lengths["values"], tf.int32)
            features["tensor"] = tf.cast(values, self.dtype)
        return features
def write_sequence_record(vector, writer):
    """Serializes a sequence of vectors and writes it as one TFRecord.

    The rows of :obj:`vector` are stored, as ``float32``, in the ``"values"``
    feature list of a ``tf.train.SequenceExample``.

    Args:
        vector: A 2D Numpy float array of shape :math:`[T, D]`.
        writer: A ``tf.io.TFRecordWriter``.

    See Also:
        - :class:`opennmt.inputters.SequenceRecordInputter`
        - :func:`opennmt.inputters.create_sequence_records`
    """
    # One float feature per row of the array.
    timesteps = []
    for row in vector.astype(np.float32):
        float_list = tf.train.FloatList(value=row)
        timesteps.append(tf.train.Feature(float_list=float_list))

    sequence = tf.train.FeatureList(feature=timesteps)
    example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(feature_list={"values": sequence})
    )
    writer.write(example.SerializeToString())
def create_sequence_records(vectors, path, compression=None):
    """Creates a TFRecord file of sequence vectors.

    Args:
        vectors: An iterable of 2D Numpy float arrays of shape :math:`[T, D]`.
        path: The output TFRecord file.
        compression: Optional compression type, can be "GZIP".

    Returns:
        Path to the TFRecord file. In most cases this is the same as
        :obj:`path` but if GZIP compression is enabled, the ".gz" extension is
        added if not already present.

    Raises:
        ValueError: if :obj:`compression` is invalid.

    See Also:
        - :class:`opennmt.inputters.SequenceRecordInputter`
        - :func:`opennmt.inputters.write_sequence_record`
    """
    if compression is not None:
        if compression not in ("GZIP",):
            # Format a 1-tuple so that a tuple-valued argument is still
            # reported as the documented ValueError instead of breaking the
            # "%" formatting with a TypeError.
            raise ValueError("invalid compression type: %s" % (compression,))
        if compression == "GZIP" and not path.endswith(".gz"):
            path = "%s.gz" % path
    # The context manager closes the writer even when serializing one of the
    # vectors raises, so the file handle is never leaked.
    with tf.io.TFRecordWriter(path, options=compression) as writer:
        for vector in vectors:
            write_sequence_record(vector, writer)
    return path