dataquality.loggers package#

Submodules#

dataquality.loggers.base_logger module#

class BaseLoggerAttributes(value)#

Bases: str, Enum

A collection of all default attributes across all loggers

texts = 'texts'#
labels = 'labels'#
ids = 'ids'#
split = 'split'#
meta = 'meta'#
prob = 'prob'#
gold_conf_prob = 'gold_conf_prob'#
gold_loss_prob = 'gold_loss_prob'#
gold_loss_prob_label = 'gold_loss_prob_label'#
pred_conf_prob = 'pred_conf_prob'#
pred_loss_prob = 'pred_loss_prob'#
pred_loss_prob_label = 'pred_loss_prob_label'#
gold = 'gold'#
embs = 'embs'#
probs = 'probs'#
logits = 'logits'#
epoch = 'epoch'#
aum = 'aum'#
text_tokenized = 'text_tokenized'#
gold_spans = 'gold_spans'#
pred_emb = 'pred_emb'#
gold_emb = 'gold_emb'#
pred_spans = 'pred_spans'#
text_token_indices = 'text_token_indices'#
text_token_indices_flat = 'text_token_indices_flat'#
log_helper_data = 'log_helper_data'#
inference_name = 'inference_name'#
image = 'image'#
token_label_str = 'token_label_str'#
token_label_positions = 'token_label_positions'#
token_label_offsets = 'token_label_offsets'#
label = 'label'#
token_deps = 'token_deps'#
text = 'text'#
id = 'id'#
token_gold_probs = 'token_gold_probs'#
tokenized_label = 'tokenized_label'#
input = 'input'#
target = 'target'#
generated_output = 'generated_output'#
input_cutoff = 'input_cutoff'#
target_cutoff = 'target_cutoff'#
system_prompts = 'system_prompts'#
x = 'x'#
y = 'y'#
data_x = 'data_x'#
data_y = 'data_y'#
static get_valid()#
Return type:

List[str]
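
A minimal, hedged usage sketch: get_valid() returns the attribute names as a list of strings (presumably the member values listed above), which makes it handy for checking whether a column name is a recognized logger attribute before logging it.

>>> from dataquality.loggers.base_logger import BaseLoggerAttributes
>>> valid_attrs = BaseLoggerAttributes.get_valid()
>>> "labels" in valid_attrs  # presumably True, since 'labels' is a member value
True
>>> BaseLoggerAttributes.labels.value
'labels'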

class BaseGalileoLogger#

Bases: object

An abstract base class that all model loggers and data loggers inherit from

LOG_FILE_DIR = '/home/runner/.galileo/logs'#
logger_config: BaseLoggerConfig = BaseLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=None, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False)#
property proj_run: str#

Returns the project and run id

Example

proj-id/run-id

property write_output_dir: str#

Returns the path to the output directory for the current run

Example

/Users/username/.galileo/logs/proj-id/run-id

property split_name: str#

Returns the name of the current split

If the split is inference, the inference name is appended to the end of the split name

Example

training
inference_inf-name1

property split_name_path: str#

Returns the path part of the current split

If the split is inference, the inference name follows the split name as a path segment

Example

training
inference/inf-name1
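
The two properties differ only in the separator placed between the split and the inference name. An illustrative sketch of the documented behaviour (not the library's implementation), assuming split and inference_name are plain strings:

from typing import Optional

def split_name(split: str, inference_name: Optional[str]) -> str:
    # "training" -> "training"; "inference" + "inf-name1" -> "inference_inf-name1"
    if split == "inference" and inference_name:
        return f"{split}_{inference_name}"
    return split

def split_name_path(split: str, inference_name: Optional[str]) -> str:
    # "training" -> "training"; "inference" + "inf-name1" -> "inference/inf-name1"
    if split == "inference" and inference_name:
        return f"{split}/{inference_name}"
    return split

assert split_name("inference", "inf-name1") == "inference_inf-name1"
assert split_name_path("inference", "inf-name1") == "inference/inf-name1"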

static get_valid_attributes()#
Return type:

List[str]

abstract validate_and_format()#

Validates params passed in during logging. Implemented by child classes

Return type:

None

set_split_epoch()#

Sets the split for the current logger

If the split is not set, it will use the split set in the logger config

Return type:

None

is_valid()#
Return type:

bool

classmethod non_inference_logged()#

Return True if training, test, or validation data has been logged

If only inference data is logged, new data is appended rather than overwriting existing data. This flag is also used by the API to determine which processing jobs to run.

Return type:

bool
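
A hedged sketch of how this flag might be consulted; the branch bodies below are placeholders rather than dataquality APIs.

from dataquality.loggers.base_logger import BaseGalileoLogger

if BaseGalileoLogger.non_inference_logged():
    # training/test/validation data exists, so the run can be fully (re)processed
    print("non-inference data logged")
else:
    # only inference data has been logged, so new inference data is appended
    print("inference-only run: data is appended rather than overwritten")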

abstract log()#
Return type:

None

static validate_task(task_type)#

Raises an error if the task type is not a valid TaskType

Return type:

TaskType
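
A hedged sketch of calling the validator; "text_classification" is assumed to be a valid task type in your installed version, and the exact exception class raised for invalid input is not documented here.

from dataquality.loggers.base_logger import BaseGalileoLogger

task = BaseGalileoLogger.validate_task("text_classification")  # returns a TaskType

try:
    BaseGalileoLogger.validate_task("not_a_real_task")
except Exception as err:  # concrete exception class not documented here
    print(f"rejected invalid task type: {err}")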

upload()#
Return type:

None

classmethod get_all_subclasses()#
Return type:

List[Type[TypeVar(T, bound= BaseGalileoLogger)]]

classmethod get_logger(task_type)#
Return type:

Type[TypeVar(T, bound= BaseGalileoLogger)]
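
A hedged sketch of resolving the concrete logger class for a task; the TaskType import path is an assumption and may differ across versions.

from dataquality.loggers.base_logger import BaseGalileoLogger
from dataquality.schemas.task_type import TaskType  # import path assumed

logger_cls = BaseGalileoLogger.get_logger(TaskType.text_classification)
print(logger_cls.__name__)  # the task-specific subclass of BaseGalileoLogger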

classmethod doc()#
Return type:

None

classmethod validate_split(split)#

Raises an error if the split is not a valid Split

Return type:

str
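
A hedged sketch; "training" is one of the standard splits, and the exact exception class raised for invalid values is not documented here.

from dataquality.loggers.base_logger import BaseGalileoLogger

split = BaseGalileoLogger.validate_split("training")  # returns the validated split name as a str

try:
    BaseGalileoLogger.validate_split("not_a_split")
except Exception as err:  # concrete exception class not documented here
    print(f"rejected invalid split: {err}")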

classmethod check_for_logging_failures()#

When a threaded logging call fails, it sets logger_config.exception

If that field is set, an exception is raised here to stop the main process

Return type:

None
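
A hedged sketch of where this check fits: call it from the main process after your logging calls have completed, so a failure in a background logging thread surfaces as an exception instead of being silently swallowed.

from dataquality.loggers.base_logger import BaseGalileoLogger

# ... perform your dataquality logging calls here ...

# If a threaded logging call failed and set logger_config.exception,
# this raises in the main process rather than failing silently.
BaseGalileoLogger.check_for_logging_failures()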

classmethod is_hf_dataset(df)#
Return type:

bool

property label_idx_map: Dict[str, int]#

Convert the list of labels to a dictionary mapping each label to its index

Example:#

>>> labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
>>> label_idx_map(labels)
{"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}

labels_to_idx(gold_sequence)#

Convert a list of labels to a NumPy array of indices

Return type:

ndarray

Example:#

# labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
>>> gold_sequence = ["O", "B-LOC", "B-PER", "I-PER", "O"]
>>> labels_to_idx(gold_sequence)
[0, 3, 1, 2, 0]

Module contents#

class BaseGalileoLogger#

Bases: object

An abstract base class that all model loggers and data loggers inherit from

LOG_FILE_DIR = '/home/runner/.galileo/logs'#
logger_config: BaseLoggerConfig = BaseLoggerConfig(labels=None, tasks=None, observed_num_labels=None, observed_labels=None, tagging_schema=None, last_epoch=0, cur_epoch=None, cur_split=None, cur_inference_name=None, training_logged=False, validation_logged=False, test_logged=False, inference_logged=False, exception='', helper_data={}, input_data_logged=defaultdict(<class 'int'>, {}), logged_input_ids=defaultdict(<class 'set'>, {}), idx_to_id_map=defaultdict(<class 'list'>, {}), conditions=[], report_emails=[], ner_labels=[], int_labels=False, feature_names=[], metadata_documents=set(), finish=<function BaseLoggerConfig.<lambda>>, existing_run=False, dataloader_random_sampling=False, remove_embs=False)#
split: Optional[str]#
inference_name: Optional[str]#
property proj_run: str#

Returns the project and run id

Example

proj-id/run-id

property write_output_dir: str#

Returns the path to the output directory for the current run

Example

/Users/username/.galileo/logs/proj-id/run-id

property split_name: str#

Returns the name of the current split

If the split is inference, the inference name is appended to the end of the split name

Example

training
inference_inf-name1

property split_name_path: str#

Returns the path part of the current split

If the split is inference, the inference name follows the split name as a path segment

Example

training
inference/inf-name1

static get_valid_attributes()#
Return type:

List[str]

abstract validate_and_format()#

Validates params passed in during logging. Implemented by child classes

Return type:

None

set_split_epoch()#

Sets the split for the current logger

If the split is not set, it will use the split set in the logger config

Return type:

None

is_valid()#
Return type:

bool

classmethod non_inference_logged()#

Return True if training, test, or validation data has been logged

If only inference data is logged, new data is appended rather than overwriting existing data. This flag is also used by the API to determine which processing jobs to run.

Return type:

bool

abstract log()#
Return type:

None

static validate_task(task_type)#

Raises an error if the task type is not a valid TaskType

Return type:

TaskType

upload()#
Return type:

None

classmethod get_all_subclasses()#
Return type:

List[Type[TypeVar(T, bound= BaseGalileoLogger)]]

classmethod get_logger(task_type)#
Return type:

Type[TypeVar(T, bound= BaseGalileoLogger)]

classmethod doc()#
Return type:

None

classmethod validate_split(split)#

Raises an error if the split is not a valid Split

Return type:

str

classmethod check_for_logging_failures()#

When a threaded logging call fails, it sets logger_config.exception

If that field is set, an exception is raised here to stop the main process

Return type:

None

classmethod is_hf_dataset(df)#
Return type:

bool

property label_idx_map: Dict[str, int]#

Convert the list of labels to a dictionary mapping each label to its index

Example:#

>>> labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
>>> label_idx_map(labels)
{"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}

labels_to_idx(gold_sequence)#

Convert a list of labels to a NumPy array of indices

Return type:

ndarray

Example:#

# labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
>>> gold_sequence = ["O", "B-LOC", "B-PER", "I-PER", "O"]
>>> labels_to_idx(gold_sequence)
[0, 3, 1, 2, 0]