dataquality.clients package#

Submodules#

dataquality.clients.api module#

class ApiClient#

Bases: object

get_token()#
Return type:

str

make_request(request, url, body=None, data=None, params=None, header=None, timeout=None, files=None, return_response_without_validation=False)#

Makes an HTTP request.

This is the center point of all functions and the main entry/exit for the dataquality client to interact with the server.

Return type:

Any

get_current_user()#
Return type:

Dict

valid_current_user()#
Return type:

bool

get_project(project_id)#
Return type:

Dict

get_projects()#
Return type:

List[Dict]

get_project_by_name(project_name)#
Return type:

Dict

get_project_runs(project_id)#

Gets all runs from a project by ID

Return type:

List[Dict]

get_project_runs_by_name(project_name)#

Gets all runs from a project by name

Return type:

List[Dict]

get_project_run(project_id, run_id)#

Gets a run in a project by ID

Return type:

Dict

get_project_run_by_name(project_name, run_name)#
Return type:

Dict

update_run_name(project_name, run_name, new_name)#
Return type:

Dict

update_project_name(project_name, new_name)#
Return type:

Dict

create_project(project_name)#

Creates a project given a name and returns the project information

Return type:

Dict

create_run(project_name, run_name, task_type)#

Creates a run in a given project

Return type:

Dict

reset_run(project_id, run_id, task_type=None)#

Resets a run by deleting the run with that name and creating a new one with the same name, getting a new UUID

Called before any call to dataquality.finish if prior data was logged. See dataquality.finish.

Return type:

None

delete_run(project_id, run_id)#

Deletes a run

This clears all metadata about the run, all object data, and the run itself

Return type:

Dict

delete_run_by_name(project_name, run_name)#

Deletes a run via name

This clears all metadata about the run, all object data, and the run itself

Return type:

None

delete_project(project_id)#

Deletes a project

For each run in the project, this clears all metadata about the run, all object data, and the run itself

Return type:

Dict

delete_project_by_name(project_name)#

Deletes a project by name

For each run in the project, this clears all metadata about the run, all object data, and the run itself

Return type:

None

get_labels_for_run(project_name=None, run_name=None, task=None)#

Gets the labels for a given run, else the currently initialized project/run

If you do not provide a project and run name, the currently initialized project/run will be used. Otherwise you must provide both a project and run name. If the run is a multi-label run, a task must be provided.

Return type:

List[str]

get_tasks_for_run(project_name=None, run_name=None)#

Gets the task names for a given multi-label run.

If you do not provide a project and run name, the currently initialized project/run will be used. Otherwise you must provide both a project and run name

This function is only valid for multi-label runs.

Return type:

List[str]

get_epochs_for_run(project_name, run_name, split)#

Returns an ordered list of epochs for a run

Return type:

List[int]

create_edit(edit)#
Return type:

Dict

reprocess_run(project_name=None, run_name=None, labels=None, xray=False)#

Removed. Please see dq.internal.reprocess_run

Return type:

Dict

get_slice_by_name(project_name, slice_name)#

Get a slice by name

Return type:

Dict

get_metadata_columns(project_name, run_name, split)#

Lists the available metadata columns for a run/split

Return type:

Dict

Structure of data is:

[{
  "name": str,
  "is_categorical": bool,
  "unique_values": Optional[List],
  "max": Optional[float],
  "min": Optional[float]
}, ...]

Parameters:
  • project_name (str) – The project name

  • run_name (str) – The run name

  • split (str) – The split to get metadata columns for

get_task_type(project_id, run_id)#
Return type:

TaskType

export_run(project_name, run_name, split, file_name, inference_name='', slice_name=None, include_cols=None, col_mapping=None, hf_format=False, tagging_schema=None, filter_params=None)#

Export a project/run to disk as a file

Parameters:
  • project_name (str) – The project name

  • run_name (str) – The run name

  • split (str) – The split to export on

  • file_name (str) – The file name. Must end in a supported FileType

  • inference_name (str) – Required if split is inference. The name of the inference split to get data for.

  • slice_name (Optional[str]) – The optional slice name to export. If selected, only data from this slice will be exported.

  • include_cols (Optional[List[str]]) – List of columns to include in the export. If not set, all columns will be exported. If "*" is included, return all metadata columns.

  • col_mapping (Optional[Dict[str, str]]) – Dictionary of renamed column names for export.

  • hf_format (bool) – (NER only) Whether to export the dataframe in a HuggingFace compatible format.

  • tagging_schema (Optional[TaggingSchema]) – (NER only) If hf_format is True, you must pass a tagging schema.

  • filter_params (Optional[Dict]) – Filters to apply to the dataframe before exporting. Only rows with matching filters will be included in the exported data.

Return type:

None

get_project_run_name(project_id=None, run_id=None)#

Gets the project/run name given project/run IDs, or based on the config's current project and run IDs.

Return type:

Tuple[str, str]

get_run_status(project_name=None, run_name=None)#
Return type:

Dict[str, Any]

Return type:

str

wait_for_run(project_name=None, run_name=None)#
Return type:

None

get_presigned_url(method, bucket_name, object_name, project_id)#
Return type:

str

get_run_summary(project_name, run_name, split, task=None, inference_name=None, filter_params=None)#

Gets overall run summary, or summary of a filtered subset.

Use filter_params to apply arbitrary filters on the dataframe, based on the filter schema: https://api.dev.rungalileo.io/redoc#tag/insights

Return type:

Dict

get_run_metrics(project_name, run_name, split, task=None, inference_name=None, category='gold', filter_params=None)#
Return type:

Dict[str, List]

get_column_distribution(project_name, run_name, split, task=None, inference_name=None, column='data_error_potential', filter_params=None)#
Return type:

Dict[str, List]

get_alerts(project_name, run_name, split, inference_name=None)#

Queries API for alerts for a run/split

Return type:

List[Dict[str, str]]

delete_alerts_for_split(project_id, run_id, split)#
Return type:

None

delete_alerts(project_name, run_name)#

Delete all alerts for a run

Return type:

None

get_edits(project_name, run_name, split, inference_name=None)#

Gets all edits for a run/split

Return type:

List

export_edits(project_name, run_name, split, file_name, inference_name=None, include_cols=None, col_mapping=None, hf_format=False, tagging_schema=None)#

Export the edits of a project/run/split to disk as a file

Parameters:
  • project_name (str) – The project name

  • run_name (str) – The run name

  • split (str) – The split to export on

  • file_name (str) – The file name. Must end in a supported FileType

  • inference_name (Optional[str]) – Required if split is inference. The name of the inference split to get data for.

  • include_cols (Optional[List[str]]) – List of columns to include in the export. If not set, all columns will be exported.

  • col_mapping (Optional[Dict[str, str]]) – Dictionary of renamed column names for export.

  • hf_format (bool) – (NER only) Whether to export the dataframe in a HuggingFace compatible format.

  • tagging_schema (Optional[TaggingSchema]) – (NER only) If hf_format is True, you must pass a tagging schema.

  • filter_params – Filters to apply to the dataframe before exporting. Only rows with matching filters will be included in the exported data.

Return type:

None

notify_email(data, template, emails=None)#
Return type:

None

get_splits(project_id, run_id)#
Return type:

Dict

get_inference_names(project_id, run_id)#
Return type:

Dict

set_metric_for_run(project_id, run_id, data)#
Return type:

Dict

get_healthcheck_dq()#
Return type:

Dict

upload_file_for_project(project_id, file_path, export_format, export_cols, bucket)#
Return type:

Any

get_presigned_url_for_model(project_id, run_id, model_kind, model_parameters)#

Returns a presigned url for uploading a model to S3

Return type:

str

get_uploaded_model_info(project_id, run_id)#

Returns information about the model for a given run. Will also update the status to complete.

Parameters:
  • project_id (UUID) – The project id

  • run_id (UUID) – The run id

Return type:

Any

dataquality.clients.objectstore module#

class ObjectStore#

Bases: object

DOWNLOAD_CHUNK_SIZE_MB = 256#
create_minio_client_for_exoscale_cluster()#
Return type:

Any

create_object(object_name, file_path, content_type='application/octet-stream', progress=True, bucket_name=None)#
Return type:

None

create_project_run_object_from_df(df, object_name, bucket_name=None)#

Uploads a Vaex dataframe at the specified object_name location

Return type:

None

download_file(object_name, file_path, bucket=None)#
Parameters:
  • object_name (str) – The object name.

  • file_path (str) – Where to write the object data locally.

  • bucket (Optional[str]) – The bucket name. If None, the root bucket name is used.

Returns:

The local file where the object name was written.

Return type:

str

Module contents#