dataquality.integrations.seq2seq.formatters package#

Submodules#

dataquality.integrations.seq2seq.formatters.alpaca module#

class AlpacaFormatter(name='tatsu-lab/alpaca', input_col='formatted_input', target_col='output', max_train_size=1000, process_batch=False)#

Bases: BaseFormatter

name: str = 'tatsu-lab/alpaca'#
input_col: str = 'formatted_input'#
target_col: str = 'output'#
max_train_size: int = 1000#
property remove_cols: List[str]#
format_sample(sample, idx=None)#

Formats the alpaca dataset for seq2seq

Return type:

Dict[str, str]

Example

>>> sample = {
...     "instruction": "Summarize the following paragraph",
...     "input": "The quick brown fox jumped over the lazy dog.",
...     "target": "The quick brown fox jumped over the lazy dog.",
... }
>>> AlpacaFormatter().format_sample(sample)
{
    "formatted_input": (
        "Human: Summarize the following paragraph "
        "Context: The quick brown fox jumped over the lazy dog."
    )
}

dataquality.integrations.seq2seq.formatters.base module#

class BatchData(batch)#

Bases: object

batch: Dict[str, Any]#
sample_from_idx(batch_idx)#

Gets a subset of the batch

Return type:

Dict[str, Any]

class BaseFormatter(name, input_col, target_col, max_train_size=None, process_batch=False)#

Bases: ABC

name: str#
input_col: str#
target_col: str#
max_train_size: Optional[int] = None#
process_batch: bool = False#
property remove_cols: List[str]#
format_batch(batch, idxs)#

Formats a batch of chat data for seq2seq

Return type:

Dict[str, List]

abstract format_sample(sample, idx=None)#

Must be implemented by subclass

Return type:

Dict[str, Any]

class DefaultFormatter(name='default', input_col='input', target_col='target', max_train_size=None, process_batch=False)#

Bases: BaseFormatter

name: str = 'default'#
input_col: str = 'input'#
target_col: str = 'target'#
format_sample(sample, idx=None)#

Base formatter is identity function

Return type:

Dict[str, Any]

dataquality.integrations.seq2seq.formatters.chat module#

class ChatFormatter(name='chat', input_col='input', target_col='target', max_train_size=None, process_batch=True, turns_col='turns', metadata_col='metadata', content_col='content', role_col='role', user='User', assistant='Chatbot', system='System')#

Bases: BaseFormatter

name: str = 'chat'#
input_col: str = 'input'#
target_col: str = 'target'#
max_train_size: Optional[int] = None#
process_batch: bool = True#
turns_col: str = 'turns'#
metadata_col: str = 'metadata'#
content_col: str = 'content'#
role_col: str = 'role'#
user: str = 'User'#
assistant: str = 'Chatbot'#
system: str = 'System'#
format_sample(sample, idx=None)#

Formats a chat dataset for seq2seq

Takes in a sample with a "turns" column and explodes it to have one row per turn.

Return type:

Dict[str, Any]

Example

>>> sample = {
...     "turns": [
...         {"role": "User", "content": "Hello"},
...         {"role": "Chatbot", "content": "Hi"},
...         {"role": "User", "content": "How are you?"},
...         {"role": "Chatbot", "content": "I'm good, how are you?"},
...     ],
...     "metadata": {"unique_id": 1234, "dataset": "test"},
...     "score": 0.5,
... }
>>> ChatFormatter().format_sample(sample, 5)
{
    "chat_id": [5, 5],
    "turn_id": [1, 2],
    "input": ["Hello", "How are you?"],
    "target": ["Hi", "I'm good, how are you?"],
    "unique_id": [1234, 1234],
    "dataset": ["test", "test"],
}
class ChatHistoryFormatter(name='chat', input_col='input', target_col='target', max_train_size=None, process_batch=True, turns_col='turns', metadata_col='metadata', content_col='content', role_col='role', user='User', assistant='Chatbot', system='System', hf_tokenizer='google/flan-t5-base', tokenizer=None, max_input_tokens=512)#

Bases: ChatFormatter

hf_tokenizer: Optional[str] = 'google/flan-t5-base'#
tokenizer: Optional[PreTrainedTokenizerFast] = None#
max_input_tokens: int = 512#
format_sample(sample, idx=None)#

Formats a chat dataset for seq2seq with previous turn history

Return type:

Dict[str, Any]

Similar to ChatFormatter, except subsequent turns contain context from previous turns.

Example:
>>> sample = {
...     "turns": [
...         {"role": "User", "content": "Hello"},
...         {"role": "Chatbot", "content": "Hi"},
...         {"role": "User", "content": "How are you?"},
...         {"role": "Chatbot", "content": "I'm good, how are you?"},
...     ],
...     "metadata": {"unique_id": 1234, "dataset": "test"},
...     "score": 0.5,
... }
>>> ChatHistoryFormatter().format_sample(sample, 5)
{
    "chat_id": [5, 5],
    "turn_id": [1, 2],
    "input": [
        "Hello",
        "Hello\n\nHi\n\nHow are you?",
    ],
    "target": ["Hi", "I'm good, how are you?"],
    "unique_id": [1234, 1234],
    "dataset": ["test", "test"],
}

Module contents#

get_formatter(name)#

Returns the formatter for the given name

If the name isn’t found, returns the base formatter

Return type:

BaseFormatter