dataquality.loggers.data_logger package#

Subpackages#

Submodules#

dataquality.loggers.data_logger.base_data_logger module#

class BaseGalileoDataLogger(meta=None)#

Bases: BaseGalileoLogger

Base class for data loggers.

A document column is a large string column: each value is longer than 1,000 characters but at most 10,000. To avoid massive files, we limit the number of document columns logged (LIMIT_NUM_DOCS)

MAX_META_COLS = 25#
MAX_STR_LEN = 1000#
MAX_DOC_LEN = 10000#
LIMIT_NUM_DOCS = 3#
INPUT_DATA_BASE = 'input_data'#
STRING_MAX_SIZE_B = 1500000000.0#
DATA_FOLDER_EXTENSION = {'data': 'hdf5', 'emb': 'hdf5', 'prob': 'hdf5'}#
INPUT_DATA_FILE_EXT = 'arrow'#
property input_data_path: str#

Return the path to the input data folder.

Example

/Users/username/.galileo/logs/proj-id/run-id/input_data

input_data_file(input_num=None, split=None)#

Return the path to the input data file.

Return type:

str

Example

/Users/username/.galileo/logs/proj-id/run-id/input_data/train/data_0.arrow

abstract log_data_sample(*, text, id, **kwargs)#

Log a single input sample. See child for details

Return type:

None

abstract log_data_samples(*, texts, ids, **kwargs)#

Log a list of input samples. See child for details

Return type:

None

abstract log_dataset(dataset, *, batch_size=100000, text='text', id='id', split=None, meta=None, **kwargs)#

Log a dataset/iterable of input samples.

Provide the dataset and the keys to index into it. See child for details

Return type:

None

validate_ids_for_split(ids)#

Validate ids for the current split

Validates:
  • that the ids are unique for the current split

  • that the ids are not already logged for the current split

On success:
  • adds the ids to the logged_input_ids for the current split

Return type:

None
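As a rough illustration, the checks might look like this (a minimal sketch, assuming logged_input_ids is the per-split set mentioned above; names and error messages are illustrative, not the library's code):

from typing import Dict, List, Set

def validate_ids_for_split_sketch(
    ids: List[int], split: str, logged_input_ids: Dict[str, Set[int]]
) -> None:
    # Ids must be unique within this batch
    if len(ids) != len(set(ids)):
        raise ValueError(f"Ids must be unique for split {split}")
    # Ids must not have been logged for this split already
    already_logged = logged_input_ids.setdefault(split, set())
    dupes = already_logged.intersection(ids)
    if dupes:
        raise ValueError(f"{len(dupes)} ids were already logged for split {split}")
    # On success, record the ids as logged for this split
    already_logged.update(ids)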

add_ids_to_split(ids)#
Return type:

None

log()#

Writes input data to disk in .galileo/logs

If input data already exists, new data is appended to the existing input file. If the dataset is very large, this function will be called multiple times for a given split.

Return type:

None

export_df(df)#

Export the dataframe and increment the input_data_logged in this helper in order to allow for overrides in child classes.

For instance, SemSeg needs to do this in a multithreaded way and add locks to avoid threading issues

Return type:

None

property support_embs: bool#
property support_data_embs: bool#
apply_column_map(dataset, column_map)#

Rename columns in the dataset according to the column_map

This function works for both pandas and HF datasets

Return type:

TypeVar(DataSet, bound= Union[Iterable, DataFrame, Dataset, DataFrame])
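A minimal sketch of how one rename can cover both backends (illustrative, not the library's exact code): pandas DataFrames rename via DataFrame.rename, while HF datasets rename via Dataset.rename_columns.

import pandas as pd

def apply_column_map_sketch(dataset, column_map: dict):
    # HF datasets expose rename_columns; pandas DataFrames expose rename
    if hasattr(dataset, "rename_columns"):
        return dataset.rename_columns(column_map)
    if isinstance(dataset, pd.DataFrame):
        return dataset.rename(columns=column_map)
    return dataset

# e.g. apply_column_map_sketch(df, {"sentence": "text", "idx": "id"})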

upload(last_epoch=None, create_data_embs=False, data_embs_col='text')#

Iterates through each split's child folders [data/emb/prob] for every inference name / epoch, concatenates all of the files with vaex, and uploads them to a single file in Minio

If create_data_embs is True, this will also run an off-the-shelf transformer and upload those text embeddings alongside the model's fine-tuned embeddings

Return type:

None
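The concatenation step is conceptually the vaex pattern below (paths are illustrative; vaex.open accepts a glob and concatenates the matched files):

import vaex

# Combine the per-batch hdf5 files for one split/epoch into a single dataframe,
# then export it as one file for upload (illustrative path)
df = vaex.open(".galileo/logs/proj-id/run-id/training/0/data/*.hdf5")
df.export("combined.hdf5")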

upload_split(location, split, object_store, last_epoch, create_data_embs, data_embs_col)#
Return type:

None

classmethod create_and_upload_data_embs(df, split, epoch_or_inf, data_embs_col)#

Uploads off the shelf data embeddings for a split

Return type:

None

convert_large_string(df)#

Cast regular string to large_string for the text column

Arrow strings have a max size of 2GB, so in order to export to hdf5 and join the strings in the text column, we upcast to a large string.

We only do this for types that write to HDF5 files

Return type:

DataFrame
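For context, the underlying Arrow upcast looks like this standalone pyarrow snippet (an illustration of the type change, not the library's code):

import pyarrow as pa

arr = pa.array(["some text", "more text"])   # default string: 32-bit offsets, 2GB cap per array
large = arr.cast(pa.large_string())          # large_string: 64-bit offsets, no 2GB cap
assert large.type == pa.large_string()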

upload_split_from_in_frame(object_store, in_frame, split, split_loc, last_epoch, create_data_embs, data_embs_col)#
Return type:

None

create_in_out_frames(in_frame, dir_name, prob_only, split, epoch_or_inf)#

Formats the input data and model output data

In this step, we concatenate the many hdf5 files created during model training and logging. We log those in threaded processes, and here we combine them into a single hdf5 file that vaex can read into a dataframe

Parameters:
  • in_frame (DataFrame) – the input dataframe

  • dir_name (str) – The directory of all of the output hdf5 files

  • prob_only (bool) – If we are only uploading probability data. We only upload probability data for all epochs except the last one (we don't use cross-epoch embeddings currently, so we don't log them)

  • split (str) – The split we are logging for

  • epoch_or_inf (Union[str, int]) – The epoch or inference name we are logging for

Return type:

BaseLoggerDataFrames

classmethod process_in_out_frames(in_frame, out_frame, prob_only, epoch_or_inf_name, split)#

Processes input and output dataframes from logging

Validates uniqueness of IDs in the output dataframe. Joins inputs and outputs. Splits the dataframes into prob, emb, and data for uploading to Minio.

Parameters:
  • in_frame (DataFrame) – The input dataframe

  • out_frame (DataFrame) – The model output dataframe

  • prob_only (bool) – If we are only uploading probabilities, or everything

  • epoch_or_inf_name (str) – The epoch or inference name we are uploading for

Return type:

BaseLoggerDataFrames

classmethod upload_in_out_frames(object_store, in_out_frames, split, epoch_or_inf)#
Return type:

None

prob_only(epochs, split, epoch_or_inf_name, last_epoch)#

Determines if we are only uploading probabilities

For all epochs that aren’t the last 2 (early stopping), we only want to upload the probabilities (for DEP calculation).

Return type:

bool
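A hedged sketch of that decision, assuming epoch directory names are integers (the inference branch and exact parameter semantics are inferred from this page, not verified against the source):

from typing import List, Optional

def prob_only_sketch(
    epochs: List[str], split: str, epoch_or_inf_name: int, last_epoch: Optional[int]
) -> bool:
    if split == "inference":
        # Inference runs have no epochs; upload everything
        return False
    # Upload everything for the final 2 epochs (early stopping); probs only before that
    final_epoch = last_epoch if last_epoch is not None else max(int(e) for e in epochs)
    return int(epoch_or_inf_name) < final_epoch - 1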

validate_and_format()#

Validates the logger

Ensures that self.split is set, or sets it to the current split from the logger_config.

Each child also defines an additional validate method that is called

Return type:

None

abstract classmethod validate_labels()#
Return type:

None

validate_metadata(batch_size)#
Return type:

None

static get_data_logger_attr(cls)#

Returns the attribute that corresponds to the logger in the class. This assumes only 1 logger object exists in the class

Parameters:

cls (object) – The class

Return type:

str

Returns:

The attribute name
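Conceptually, this is a scan over the object's attributes for the one logger instance, roughly (a sketch; the real helper may differ):

from dataquality.loggers.data_logger.base_data_logger import BaseGalileoDataLogger

def get_data_logger_attr_sketch(obj: object) -> str:
    # Return the name of the single attribute holding a data logger
    for attr in dir(obj):
        if isinstance(getattr(obj, attr), BaseGalileoDataLogger):
            return attr
    raise AttributeError("No data logger attribute found")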

abstract classmethod separate_dataframe(df, prob_only=False, split=None)#
Return type:

BaseLoggerDataFrames

validate_kwargs(kwargs)#

Raises if a function that shouldn’t get kwargs gets any

Return type:

None

classmethod set_tagging_schema(tagging_schema)#

Sets the tagging schema, if applicable. Must be implemented by child

Return type:

None

dataquality.loggers.data_logger.image_classification module#

class ImageClassificationDataLogger(texts=None, labels=None, ids=None, split=None, meta=None, inference_name=None)#

Bases: TextClassificationDataLogger

Create data logger.

Parameters:
  • texts (Optional[List[str]]) – The raw text inputs for model training. List[str]

  • labels (Optional[List[str]]) – the ground truth labels aligned to each text field. List[str]

  • ids (Optional[List[int]]) – Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[Union[int,str]]]

  • split (Optional[str]) – The split for training/test/validation

logger_config: ImageClassificationLoggerConfig = ImageClassificationLoggerConfig(labels=None, tasks=None, observed_num_labels=0, observed_labels=set(), tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False, observed_ids={}, all_ids={})#
DATA_FOLDER_EXTENSION = {'data': 'arrow', 'emb': 'hdf5', 'prob': 'hdf5'}#
property support_data_embs: bool#

Coming soon via CLIP

log_image_dataset(dataset, *, imgs_local_colname=None, imgs_remote=None, batch_size=10000, id='id', label='label', split=None, inference_name=None, meta=None, column_map=None, parallel=False)#

For main docstring see top level method located in core/log.py.

Return type:

Any

convert_large_string(df)#

We override to avoid doing the computation to check if the text is over 2GB

Because this is CV, the text is almost certainly over the limit, and for really big columns the computation gets very long (and seems to actually use some memory). We just assume it's over the limit (which is safe) and export.

Return type:

DataFrame

classmethod process_in_out_frames(in_frame, out_frame, prob_only, epoch_or_inf_name, split)#

We have to be careful with joins in the image datasets because of the string-encoded images. They are too long and cause Arrow offset overflows.

Override base to handle very large strings (the encoded images)

There are a number of bugs (and open PRs) around this issue. PyArrow has a fundamental issue with strings over 2GB in size. It has a special datatype, large_string, for them, but that type is not robust. See https://issues.apache.org/jira/browse/ARROW-9773 and https://issues.apache.org/jira/browse/ARROW-17828

One such issue is the use of .take with arrays of large_strings: .take is not memory safe, and it causes an array-offset-overflow error (pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays). See https://github.com/vaexio/vaex/issues/2335 and https://github.com/huggingface/datasets/issues/615

The solution is to use .slice instead of .take: this creates a zero-copy view and does not cause the overflow. See https://github.com/huggingface/datasets/pull/645

The issue is that vaex currently uses .take (because this wasn't previously an issue) when performing join operations. Because the join in vaex is lazy, the problem doesn't materialize until exporting. The true solution is for vaex to stop using .take (I made a PR: https://github.com/vaexio/vaex/pull/2336)

So we are careful to only join on the columns we need:
  • emb: "id", "emb"

  • prob: "id", "gold", "prob"

  • data: "id", "pred" + all the other cols not in emb or prob

Return type:

BaseLoggerDataFrames
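To make the .take vs .slice distinction concrete, here is a toy pyarrow snippet (the overflow described above only appears once string offsets exceed 2GB, so this merely shows the two APIs):

import pyarrow as pa

arr = pa.array(["img-a", "img-b", "img-c"], type=pa.large_string())

# .take gathers arbitrary indices into a newly materialized array; on huge
# string columns this is where the offset-overflow error surfaces
gathered = arr.take([0, 2])

# .slice returns a zero-copy view over a contiguous range and avoids the overflow
window = arr.slice(0, 2)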

upload_split_from_in_frame(object_store, in_frame, split, split_loc, last_epoch, create_data_embs, data_embs_col)#
Return type:

None

classmethod add_cv_smart_features(in_frame, split)#

Calculate and add smart features on images (blurriness, contrast, etc.) to the dataframe.

The in_frame df only requires the column containing the paths to local images (GAL_LOCAL_IMAGES_PATHS) for this method to run.

Return type:

DataFrame

dataquality.loggers.data_logger.object_detection module#

class GalileoDataLoggerAttributes(value)#

Bases: str, Enum

An enumeration.

image = 'image'#
ids = 'ids'#
split = 'split'#
meta = 'meta'#
static get_valid()#
Return type:

List[str]

class ODCols(value)#

Bases: str, Enum

An enumeration.

image = 'image'#
bbox = 'bbox'#
gold_cls = 'gold_cls'#
id = 'id'#
split = 'split'#
meta = 'meta'#
width = 'width'#
height = 'height'#
class ObjectDetectionDataLogger(images=None, ids=None, split=None, meta=None, inference_name=None, width=None, height=None)#

Bases: BaseGalileoDataLogger

Class for logging input data/data of Object Detection models to Galileo.

logger_config: ObjectDetectionLoggerConfig = ObjectDetectionLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=None, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False, image_cloud_path='', box_format=<BoxFormat.xyxy: 'xyxy'>)#
static get_valid_attributes()#

Returns a list of valid attributes that GalileoModelConfig accepts

Return type:

List[str]

log_dataset(dataset, *, batch_size=100000, image='image', id='id', width='width', height='height', split=None, inference_name=None, meta=None, **kwargs)#

Log a dataset of input samples for OD

Return type:

None

log_image_samples(*, images, ids, width, height, split=None, inference_name=None, meta=None, **kwargs)#

Log input samples for OD

Return type:

None

convert_large_string(df)#

We skip this step because there is no ‘text’ field

Return type:

DataFrame

prob_only(epochs, split, epoch_or_inf_name, last_epoch)#

In OD, there's only 1 epoch, so we want to upload it all, not just probs

Return type:

bool

classmethod create_and_upload_data_embs(df, split, epoch_or_inf, data_embs_col)#

Data embeddings not yet supported for any CV task

Return type:

None

classmethod process_in_out_frames(in_frame, out_frame, prob_only, epoch_or_inf_name, split)#

Process the logged input data and output data into uploadable files

In OD (like NER), we have 2 different 'levels' of data: image and box. The in_frame is everything at the image level (image url, metadata, etc.). The out_frame is the embeddings, probabilities, bboxes, label, etc. (see dq.loggers.model_logger._get_data_dict for more details).

We want to upload the image data as the data field, the probabilities and boxes as the probs field, and embeddings as the emb field.

It's possible that PCA/UMAP have already been applied, so we will check for those columns (via CUDA). If they have, we upload the PCA embeddings as well as the x, y coordinates of UMAP.

If not, we will preemptively apply PCA to the embeddings before uploading. This is because the embeddings in OD are very large (> 2000 dims), which doesn't scale. Additionally, we run our algorithms off of the PCA embeddings on the server. So we will always drop the raw embeddings in favor of the PCA ones.

Return type:

BaseLoggerDataFrames

property support_data_embs: bool#

dataquality.loggers.data_logger.semantic_segmentation module#

class SemanticSegmentationDataLogger(texts=None, labels=None, ids=None, split=None, meta=None, inference_name=None)#

Bases: BaseGalileoDataLogger

logger_config: SemanticSegmentationLoggerConfig = SemanticSegmentationLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=None, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False)#
INPUT_DATA_FILE_EXT = 'hdf5'#
log_dataset(dataset, *, batch_size=1000, text='text', image='image', id='id', split=None, inference_name=None, meta=None, **kwargs)#

Log a dataset/iterable of input samples.

Provide the dataset and the keys to index into it. See child for details

Return type:

None

log_image_samples(*, images, ids, split=None, inference_name=None, meta=None, **kwargs)#

Log input samples for semseg

Return type:

None

export_df(df)#

Export the dataframe and increment the input_data_logged in this helper in order to allow for overrides in child classes.

For instance, SemSeg needs to do this in a multithreaded way and add locks to avoid threading issues

Return type:

None

upload_split_from_in_frame(object_store, in_frame, split, split_loc, last_epoch, create_data_embs, data_embs_col)#

Upload image df and polygon df to Minio root bucket

For SemSeg we only have one epoch, the final pass. So for now we hard code 0 in place of last_epoch.

Return type:

None

property support_embs: bool#

Not yet supported for SemSeg. Coming soon!

property support_data_embs: bool#

dataquality.loggers.data_logger.tabular_classification module#

class TabularClassificationDataLogger(model=None, X=None, y=None, feature_names=None, split=None, inference_name=None)#

Bases: BaseGalileoDataLogger

logger_config: TabularClassificationLoggerConfig = TabularClassificationLoggerConfig(labels=None, tasks=None, observed_num_labels=0, observed_labels=set(), tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False, feature_importances={})#
validate_and_prepare_logger()#

Validates the input data before logging to Minio

Return type:

None

Validates:
  • The model has a predict_proba method

  • The model is fit

  • The data is a pandas DataFrame or numpy array

  • If the split is not inference, the labels are a numpy array

  • If the split is not inference, the data and labels are the same length

  • If X is a numpy array, the feature names are provided

  • If X is a numpy array, the number of features in X and feature names

    are the same

  • Feature names match the feature names logged in a prior split

  • Feature names are valid names, no special chars

Sets:
  • self.X to a pandas DataFrame if it is a numpy array

  • self.y to a numpy array if it is a list

  • self.feature_names to the column names of X if it is a pandas DataFrame

  • logger_config.feature_names to the column names of X if they aren’t set
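Two of the checks above, sketched (a hypothetical helper covering only the predict_proba and feature-name rules; the real method implements the full list):

import numpy as np
import pandas as pd

def validate_tabular_sketch(model, X, feature_names=None) -> pd.DataFrame:
    # The model must expose predict_proba (i.e., be a fit classifier)
    if not hasattr(model, "predict_proba"):
        raise ValueError("Model must have a predict_proba method")
    # Numpy inputs need feature names matching the number of columns
    if isinstance(X, np.ndarray):
        if feature_names is None or len(feature_names) != X.shape[1]:
            raise ValueError("feature_names must be provided and match X's columns")
        X = pd.DataFrame(X, columns=list(feature_names))
    return X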

set_probs()#

Sets the probs attribute for the class

Assumes model and dataset are set.

Return type:

None
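Presumably this reduces to a predict_proba call on the logged data, roughly as below (a sketch; the probs attribute name is an assumption):

# Hypothetical sketch: compute class probabilities from the fit model
def set_probs_sketch(logger) -> None:
    # Assumes logger.model is fit and logger.X holds the logged features
    logger.probs = logger.model.predict_proba(logger.X)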

save_feature_importances()#

Saves feature importances in the DB

Assumes the model is fit

Return type:

None

log()#

Writes the data and probs dataframes to disk in .galileo/logs

Support for batching to come in V1 of the tabular data project.

We write the dfs to disk in the following locations:

/Users/username/.galileo/logs/proj-id/run-id/training/data/data.hdf5
/Users/username/.galileo/logs/proj-id/run-id/training/prob/prob.hdf5

Return type:

None

upload(last_epoch=None, create_data_embs=False, data_embs_col='text')#

Uploads the data and prob files for a given split to Minio

Pulls files from disk and uploads them to Minio.

Return type:

None

From disk:

/Users/username/.galileo/logs/proj-id/run-id/training/prob.hdf5

To Minio:

bucket/proj-id/run-id/training/prob.hdf5

dataquality.loggers.data_logger.text_classification module#

class GalileoDataLoggerAttributes(value)#

Bases: str, Enum

An enumeration.

texts = 'texts'#
labels = 'labels'#
ids = 'ids'#
split = 'split'#
meta = 'meta'#
inference_name = 'inference_name'#
static get_valid()#
Return type:

List[str]

class TextClassificationDataLogger(texts=None, labels=None, ids=None, split=None, meta=None, inference_name=None)#

Bases: BaseGalileoDataLogger

Class for logging input data/metadata of Text Classification models to Galileo.

  • texts: The raw text inputs for model training. List[str]

  • labels: the ground truth labels aligned to each text field. List[str]

  • ids: Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[int]]

  • split: The split for training/test/validation

ex:

all_labels = ["A", "B", "C"]
dq.set_labels_for_run(labels=all_labels)

texts: List[str] = [
    "Text sample 1",
    "Text sample 2",
    "Text sample 3",
    "Text sample 4",
]

labels: List[str] = ["B", "C", "A", "A"]

ids: List[int] = [0, 1, 2, 3]
meta = {"sample_quality": [5.3, 9.1, 2.7, 5.8]}
split = "training"

dq.log_data_samples(texts=texts, labels=labels, ids=ids, meta=meta, split=split)

Create data logger.

Parameters:
  • texts (Optional[List[str]]) – The raw text inputs for model training. List[str]

  • labels (Optional[List[str]]) – the ground truth labels aligned to each text field. List[str]

  • ids (Optional[List[int]]) – Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[Union[int,str]]]

  • split (Optional[str]) – The split for training/test/validation

logger_config: BaseLoggerConfig = TextClassificationLoggerConfig(labels=None, tasks=None, observed_num_labels=0, observed_labels=set(), tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False)#
log_data_samples(*, texts, ids, labels=None, split=None, inference_name=None, meta=None, **kwargs)#

Log input samples for text classification

ex:

dq.init("text_classification")
all_labels = ["A", "B", "C"]
dq.set_labels_for_run(labels=all_labels)

texts: List[str] = [
    "Text sample 1",
    "Text sample 2",
    "Text sample 3",
    "Text sample 4",
]

labels: List[str] = ["B", "C", "A", "A"]

ids: List[int] = [0, 1, 2, 3]
split = "training"

dq.log_data_samples(texts=texts, labels=labels, ids=ids, split=split)

Parameters:
  • texts (List[str]) – List[str] text samples

  • ids (List[int]) – List[int | str] IDs for each text sample

  • labels (Optional[List[str]]) – List[str] labels for each text sample. Required if not in inference

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • inference_name (Optional[str]) – If logging inference data, a name for this inference data is required. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetasType, bound= Dict[str, List[Union[str, float, int]]])]) – Dict[str, List[str, int, float]]. Metadata for each text sample. Format is {"metadata_field_name": [metadata value per sample]}

Return type:

None

log_data_sample(*, text, id, label=None, split=None, inference_name=None, meta=None, **kwargs)#

Log a single input sample for text classification

Parameters:
  • text (str) – str the text sample

  • id (int) – The sample ID

  • label (Optional[str]) – str label for the sample. Required if not in inference

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • inference_name (Optional[str]) – If logging inference data, a name for this inference data is required. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetaType, bound= Dict[str, Union[str, float, int]])]) – Dict[str, Union[str, int, float]]. Metadata for the text sample. Format is {"metadata_field_name": metadata_field_value}

Return type:

None

log_dataset(dataset, *, batch_size=100000, text='text', id='id', label='label', split=None, inference_name=None, meta=None, **kwargs)#

Log a dataset of input samples for text classification

Parameters:
  • dataset (TypeVar(DataSet, bound= Union[Iterable, DataFrame, Dataset, DataFrame])) – The dataset to log. This can be a Python iterable or a Pandas/Vaex dataframe. If an iterable, it can be a list of elements that can be indexed into either via int index (tuple/list) or string/key index (dict)

  • batch_size (int) – Number of samples to log in a batch. Default 100,000

  • text (Union[str, int]) – The key/index of the text fields

  • id (Union[str, int]) – The key/index of the id fields

  • label (Union[str, int, None]) – The key/index of the label fields

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • inference_name (Optional[str]) – If logging inference data, a name for this inference data is required. Can be set here or via dq.set_split

  • meta (Union[List[str], List[int], None]) – List[str, int]: The keys/indexes of each metadata field. For a pandas dataframe, this would be the list of columns corresponding to each metadata field to log

Return type:

None
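For example, logging a pandas dataframe through the top-level wrapper might look like this (column names are illustrative):

import pandas as pd
import dataquality as dq

df = pd.DataFrame(
    {
        "id": [0, 1, 2],
        "text": ["Text sample 1", "Text sample 2", "Text sample 3"],
        "label": ["A", "B", "A"],
        "sample_quality": [5.3, 9.1, 2.7],  # extra column logged as metadata
    }
)

dq.log_dataset(
    df, text="text", id="id", label="label", meta=["sample_quality"], split="training"
)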

static get_valid_attributes()#

Returns a list of valid attributes that this logger accepts

Return type:

List[str]

validate_and_format()#

Validates that the current config is correct.
  • Text and Labels must both exist (unless split is 'inference', in which case labels must be None)

  • Text and Labels must be the same length

  • If ids exist, they must be the same length as text/labels

If the user logged labels as ints, convert them to the string labels. In the next optimization, we will support the API having int labels, but for now it expects string labels. When we make that change, we will do the opposite and always convert to the int index of the labels.

Return type:

None

validate_logged_labels()#

Validates that the labels logged match the labels set

Return type:

None

classmethod separate_dataframe(df, prob_only=True, split=None)#

Separates the singular dataframe into its 3 components

Gets the probability df, the embedding df, and the “data” df containing all other columns

Return type:

BaseLoggerDataFrames
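A rough sketch of the grouping with vaex, assuming the conventional column names used elsewhere on this page ("id", "emb", "gold", "prob"); the real method returns a BaseLoggerDataFrames:

import vaex

def separate_dataframe_sketch(df: vaex.dataframe.DataFrame):
    # Embeddings and probabilities each keep the id for re-joining server-side
    emb = df[["id", "emb"]]
    prob = df[["id", "gold", "prob"]]
    # Everything else (text, metadata, predictions, ...) becomes the "data" df
    data_cols = [c for c in df.get_column_names() if c not in ("emb", "gold", "prob")]
    data = df[data_cols]
    return emb, prob, data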

classmethod validate_labels()#
Return type:

None

dataquality.loggers.data_logger.text_multi_label module#

class TextMultiLabelDataLogger(texts=None, labels=None, ids=None, split=None, meta=None)#

Bases: TextClassificationDataLogger

Class for logging input data/metadata of Text Multi Label models to Galileo.

  • texts: The raw text inputs for model training. List[str]

  • task_labels: the list of ground truth labels aligned to each text field. Each text field input must have the same number of labels (which must be the number of tasks). List[List[str]]

  • ids: Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[int]]

  • split: The split for training/test/validation

task_labels = [["A", "B", "C"], ["Foo", "Bar"], ["Apple", "Orange", "Grape"]]
tasks = ["Task_0", "Task_1", "Task_2"]
dq.init("text_multi_label")
dq.set_tasks_for_run(tasks)
dq.set_labels_for_run(labels = task_labels)

texts: List[str] = [
    "Text sample 1",
    "Text sample 2",
    "Text sample 3",
]

task_labels: List[List[str]] = [
    ["A", "Foo", "Grape"],
    ["C", "Foo", "Apple"],
    ["B", "Bar", "Orange"]
]

ids: List[int] = [0, 1, 2]
meta = {"sample_quality": [5.3, 9.1, 2.7]}
split = "training"

dq.log_data_samples(
    texts=texts, task_labels=task_labels, ids=ids, meta=meta, split=split
)

Create data logger.

Parameters:
  • texts (Optional[List[str]]) – The raw text inputs for model training. List[str]

  • labels (Optional[List[List[str]]]) – the ground truth labels aligned to each text field. List[List[str]]

  • ids (Optional[List[int]]) – Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[Union[int,str]]]

  • split (Optional[str]) – The split for training/test/validation

logger_config: TextMultiLabelLoggerConfig = TextMultiLabelLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=defaultdict(<class 'set'>, {}), tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False, observed_num_tasks=0, binary=True)#
log_data_sample(*, text, id, label=None, split=None, inference_name=None, meta=None, task_labels=None, **kwargs)#

Log a single input sample for text multi-label

Parameters:
  • text (str) – str the text sample

  • id (int) – The sample ID

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetaType, bound= Dict[str, Union[str, float, int]])]) – Dict[str, Union[str, int, float]]. Metadata for the text sample. Format is {"metadata_field_name": metadata_field_value}

  • task_labels (Optional[List[str]]) – List[str] The label of each task for this sample Required if split is not inference

Return type:

None

log_data_samples(*, texts, ids, labels=None, split=None, inference_name=None, meta=None, task_labels=None, **kwargs)#

Log input samples for text multi-label

Parameters:
  • texts (List[str]) – List[str] text samples

  • ids (List[int]) – List[int,str] IDs for each text sample

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetasType, bound= Dict[str, List[Union[str, float, int]]])]) – Dict[str, List[str, int, float]]. Metadata for each text sample. Format is {"metadata_field_name": [metadata value per sample]}

  • task_labels (Optional[List[List[str]]]) – List[List[str]] list of labels for each task for each text sample. Required if not in inference

Return type:

None

validate_and_format()#

Parent validation (text_classification), with additional validation on labels.

In multi-label modeling, each element in self.labels should itself be a list

Return type:

None

validate_logged_labels()#

Validates that the labels logged match the labels set

Return type:

None

classmethod validate_labels()#
Return type:

None

dataquality.loggers.data_logger.text_ner module#

class GalileoDataLoggerAttributes(value)#

Bases: str, Enum

An enumeration.

texts = 'texts'#
text_token_indices = 'text_token_indices'#
text_token_indices_flat = 'text_token_indices_flat'#
gold_spans = 'gold_spans'#
ids = 'ids'#
split = 'split'#
meta = 'meta'#
static get_valid()#
Return type:

List[str]

class TextNERDataLogger(texts=None, text_token_indices=None, gold_spans=None, ids=None, split=None, meta=None, inference_name=None)#

Bases: BaseGalileoDataLogger

Class for logging input data/metadata of Text NER models to Galileo.

  • texts: The raw text inputs for model training. List[str]

  • text_token_indices: Token boundaries of text. List[List[Tuple(int, int)]].

Used to convert the gold_spans into token level spans internally. For each sample, the boundary of a token will contain the start and end character index of the word in the text to which that token belongs.

  • gold_spans: Gold spans for the text at character level indices.

The list of spans in a sample with their start and end indexes, and the label. Indexes start at 0 and are [inclusive, exclusive) for [start, end) respectively. List[List[dict]].

  • ids: Optional unique indexes for each record. If not provided, will default to

the index of the record. Optional[List[int]]

  • split: The split of training/test/validation

  • meta: Dict[str, List]. Any metadata information you want to log at a per sample

(text input) level. This could be a string (len <= 50), a float or an int. Each sample can have up to 50 meta fields. The number of values in each list must match the number of text samples logged. Format: {"sample_importance": [0.2, 0.5, 0.99, ...]}

ex:

labels = ["B-PER", "I-PER", "B-LOC", "I-LOC", "O"]
dq.set_labels_for_run(labels=labels)

# One of (IOB2, BIO, IOB, BILOU, BILOES)
dq.set_tagging_schema(tagging_schema="BIO")

texts: List[str] = [
    "The president is Joe Biden",
    "Joe Biden addressed the United States on Monday",
]

gold_spans: List[List[dict]] = [
    [
        {"start": 17, "end": 27, "label": "person"},  # "Joe Biden"
    ],
    [
        {"start": 0, "end": 10, "label": "person"},  # "Joe Biden"
        {"start": 30, "end": 41, "label": "location"},  # "United States"
    ],
]

text_token_indices = [
    [(0, 3), (4, 13), (14, 16), (17, 20), (21, 27), (21, 27)],
    # ...
]

ids: List[int] = [0, 1]
meta = {"sample_quality": [5.3, 1.1]}
split = "training"

dq.log_data_samples(
    texts=texts,
    text_token_indices=text_token_indices,
    gold_spans=gold_spans,
    meta=meta,
    ids=ids,
    split=split,
)

Create data logger.

Parameters:
  • texts (Optional[List[str]]) – The raw text inputs for model training. List[str]

  • text_token_indices (Optional[List[List[Tuple[int, int]]]]) – Token boundaries of text. List[Tuple(int, int)]. Used to convert the gold_spans into token level spans internally. t[0] indicates the start index of the span and t[1] is the end index (exclusive)

  • gold_spans (Optional[List[List[Dict]]]) – The model-level gold spans over the char index of text

  • ids (Optional[List[int]]) – Optional unique indexes for each record. If not provided, will default to the index of the record. Optional[List[Union[int,str]]]

  • split (Optional[str]) – The split for training/test/validation

DATA_FOLDER_EXTENSION = {'data': 'arrow', 'emb': 'hdf5', 'prob': 'hdf5'}#
logger_config: BaseLoggerConfig = TextNERLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=None, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False, gold_spans={}, sample_length={})#
static get_valid_attributes()#

Returns a list of valid attributes that GalileoModelConfig accepts

Return type:

List[str]

log_data_samples(*, texts, ids, text_token_indices=None, gold_spans=None, split=None, inference_name=None, meta=None, **kwargs)#

Log input samples for text NER

Parameters:
  • texts (List[str]) – List[str] text samples

  • ids (List[int]) – List[int,str] IDs for each text sample

  • text_token_indices (Optional[List[List[Tuple[int, int]]]]) – List[List[Tuple(int, int)]]. Token boundaries of each text sample, 1 list per sample. Used to convert the gold_spans into token level spans internally. t[0] indicates the start index of the span and t[1] is the end index (exclusive). Required if split is not inference

  • gold_spans (Optional[List[List[Dict]]]) – List[List[Dict]] The model-level gold spans over the char index for each text sample. 1 List[Dict] per text sample. “start”, “end”, “label” are the required keys Required if split is not inference

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • inference_name (Optional[str]) – If logging inference data, a name for this inference data is required. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetasType, bound= Dict[str, List[Union[str, float, int]]])]) – Dict[str, List[str, int, float]]. Metadata for each text sample. Format is {"metadata_field_name": [metadata value per sample]}

Return type:

None

log_data_sample(*, text, id, text_token_indices=None, gold_spans=None, split=None, inference_name=None, meta=None, **kwargs)#

Log a single input sample for text NER

Parameters:
  • text (str) – str the text sample

  • id (int) – The sample ID

  • text_token_indices (Optional[List[Tuple[int, int]]]) – List[Tuple(int, int)]. Token boundaries of the text sample. Used to convert gold_spans into token level spans internally. t[0] indicates the start index of the span and t[1] is the end index (exclusive). Required if split is not inference

  • gold_spans (Optional[List[Dict]]) – List[Dict] The model-level gold spans over the char index of the text sample. “start”, “end”, “label” are the required keys Required if split is not inference

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • meta (Optional[TypeVar(MetaType, bound= Dict[str, Union[str, float, int]])]) – Dict[str, Union[str, int, float]]. Metadata for the text sample. Format is {"metadata_field_name": metadata_field_value}

Return type:

None

log_dataset(dataset, *, batch_size=100000, text='text', id='id', text_token_indices='text_token_indices', gold_spans='gold_spans', split=None, inference_name=None, meta=None, **kwargs)#

Log a dataset of input samples for NER

Parameters:
  • dataset (TypeVar(DataSet, bound= Union[Iterable, DataFrame, Dataset, DataFrame])) – The dataset to log. This can be a Python iterable or a Pandas/Vaex dataframe. If an iterable, it can be a list of elements that can be indexed into either via int index (tuple/list) or string/key index (dict)

  • batch_size (int) – Number of samples to log in a batch. Default 100,000

  • text (Union[str, int]) – The key/index of the text fields

  • id (Union[str, int]) – The key/index of the id fields

  • text_token_indices (Union[str, int]) – The key/index of the sample text_token_indices

  • gold_spans (Union[str, int]) – The key/index of the sample gold_spans

  • split (Optional[Split]) – train/test/validation/inference. Can be set here or via dq.set_split

  • meta (Union[List[str], List[int], None]) – List[str, int]: The keys/indexes of each metadata field. For a pandas dataframe, this would be the list of columns corresponding to each metadata field to log

Return type:

None

validate_and_format()#

Validates that the current config is correct.
  • Text and Labels must both exist (unless split is 'inference', in which case gold_spans must be None)

  • Text and Labels must be the same length

  • If ids exist, they must be the same length as text/labels

Return type:

None

classmethod process_in_out_frames(in_frame, out_frame, prob_only, epoch_or_inf_name, split)#

Processes input and output dataframes from logging

NER is a different case where the input data is logged at the sample level, but output data is logged at the span level, so we need to process it differently.

We don't have span IDs, so we don't need to validate uniqueness. We don't join the input and output frames. We do need to take only the rows of in_frame that belong to this split. Splits the dataframes into prob, emb, and input data for uploading to Minio.

Return type:

BaseLoggerDataFrames

classmethod separate_dataframe(df, prob_only=True, split=None)#

Splits the dataframe into logical grouping for minio storage

NER is a different case, where we store the text samples as “data” and all of the span level data is split into only “emb” and “prob”. This function will only return 2 modified dataframes, where the third is expected to be the input data logged by the user

Return type:

BaseLoggerDataFrames

classmethod validate_labels()#

Validates and cleans labels, see _clean_labels and saves ner_labels

ner_labels are all of the labels that start with a tag prefix (B-, I-, E-, etc.) as well as the O tag

Return type:

None

classmethod is_valid_span_label(label)#

Denotes if a span label is valid based on our allowed tagging schemas

B = Beginning of the sequence
I = In the sequence
L/E = Last/Ending character of the sequence
S/U = Single/Unit element of a sequence

A valid span label would then start with 'B-', for example.

Return type:

bool
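A minimal sketch of the check (the exact prefix set is an assumption based on the schemas listed above):

def is_valid_span_label_sketch(label: str) -> bool:
    # "O" (outside) is always valid; otherwise the label must start with an
    # allowed tag prefix from the supported schemas, e.g. "B-PER" or "I-LOC"
    return label == "O" or label[:2] in ("B-", "I-", "L-", "E-", "S-", "U-")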

classmethod set_tagging_schema(tagging_schema)#

Sets the tagging schema, if applicable. Must be implemented by child

Return type:

None

property support_data_embs: bool#

Not yet supported for NER. Coming soon!

classmethod create_and_upload_data_embs(df, split, epoch_or_inf, data_embs_col)#

Not yet supported for NER. Coming soon!

Return type:

None

Module contents#

class BaseGalileoDataLogger(meta=None)#

Bases: BaseGalileoLogger

Base class for data loggers, re-exported at the package level. See dataquality.loggers.data_logger.base_data_logger above for the full class documentation.

meta: Dict#
split: Optional[str]#
inference_name: Optional[str]#