gigl.src.common.utils.bq#

Attributes#

Classes#

Module Contents#

class gigl.src.common.utils.bq.BqUtils(project=None)[source]#
Parameters:

project (Optional[str])
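
A minimal instantiation sketch; the project ID below is a placeholder, and passing None is assumed to fall back to the class's default project handling:

from gigl.src.common.utils.bq import BqUtils

# Hypothetical GCP project ID.
bq_utils = BqUtils(project="my-gcp-project")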

check_columns_exist_in_table(bq_table, columns)[source]#
Parameters:
  • bq_table (str)

  • columns (Iterable[str])

Return type:

None

count_number_of_columns_in_bq_table(bq_table)[source]#
Parameters:

bq_table (str)

Return type:

int

count_number_of_rows_in_bq_table(bq_table, labels={})[source]#
Parameters:
  • bq_table (str)

  • labels (Dict[str, str])

Return type:

int

create_bq_dataset(dataset_id, exists_ok=True)[source]#
Return type:

None

create_or_empty_bq_table(bq_path, schema=None)[source]#
Parameters:
  • bq_path (str)

  • schema (Optional[List[google.cloud.bigquery.SchemaField]])

Return type:

None
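
A sketch of creating (or emptying) a table with an explicit schema, reusing the bq_utils instance constructed above; the table path and fields are hypothetical:

from google.cloud import bigquery

# Hypothetical schema and table path.
schema = [
    bigquery.SchemaField("node_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("degree", "INTEGER", mode="NULLABLE"),
]
bq_utils.create_or_empty_bq_table(
    bq_path="my-gcp-project.my_dataset.my_table",
    schema=schema,
)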

delete_bq_table_if_exist(bq_table_path, not_found_ok=True)[source]#

Example: bq_table_path = 'your-project.your_dataset.your_table'

Parameters:
  • bq_table_path (str)

  • not_found_ok (bool)

Return type:

None
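
For example (hypothetical path), deleting a table without raising if it is already gone:

# `bq_utils` is the BqUtils instance constructed earlier.
bq_utils.delete_bq_table_if_exist(
    bq_table_path="your-project.your_dataset.your_table",
    not_found_ok=True,
)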

delete_matching_tables(bq_dataset_path, table_match_string)[source]#
Parameters:
  • bq_dataset_path (str)

  • table_match_string (str)

Return type:

None

does_bq_table_exist(bq_table_path)[source]#
Parameters:

bq_table_path (str)

Return type:

bool

export_to_gcs(bq_table_path, destination_gcs_uri, destination_format='NEWLINE_DELIMITED_JSON')[source]#

Export a BigQuery table to Google Cloud Storage.

Parameters:
  • bq_table_path (str) – The full BigQuery table path to export.

  • destination_gcs_uri (str) – The destination GCS URI where the table will be exported. If the GCS URI contains a * wildcard, the table is exported to multiple shards.

  • destination_format (str, optional) – The format of the exported data. Defaults to 'NEWLINE_DELIMITED_JSON'; 'CSV', 'AVRO', and 'PARQUET' are also available.

Return type:

None
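
A sketch of a sharded export, where the * wildcard in the GCS URI triggers multi-shard output; bucket and table names are hypothetical:

# `bq_utils` is the BqUtils instance constructed earlier.
bq_utils.export_to_gcs(
    bq_table_path="my-gcp-project.my_dataset.my_table",
    destination_gcs_uri="gs://my-bucket/exports/my_table-*.jsonl",
    destination_format="NEWLINE_DELIMITED_JSON",
)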

fetch_bq_table_schema(bq_table)[source]#

Create a dictionary representation of SchemaField objects from a BigQuery table.

Parameters:

bq_table (str)

Return type:

Dict[str, google.cloud.bigquery.SchemaField]
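
A sketch of inspecting the returned column-name-to-SchemaField mapping; the table path is hypothetical:

# `bq_utils` is the BqUtils instance constructed earlier.
schema_by_column = bq_utils.fetch_bq_table_schema("my-gcp-project.my_dataset.my_table")
for column_name, field in schema_by_column.items():
    print(column_name, field.field_type)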

static format_bq_path(bq_path, format_for_table_reference=False)[source]#

Formats BQ paths.

Parameters:
  • bq_path (str) – expected to be one of: "<project>.<dataset>.<table>", "<project>:<dataset>.<table>", "<project>.<dataset>", "<project>:<dataset>", or "<dataset>.<table>"

  • format_for_table_reference (bool, optional) – If project, dataset, and table are all specified, add the ":" separator between project and dataset. Useful when a "table_reference" is required instead of a path, i.e. when using the BigQuery IO operator for Beam pipelines. Defaults to False.

Returns:

Formatted bq path

Return type:

str
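
A sketch of both formatting modes, assuming the ":" separator behavior described above; identifiers are hypothetical:

from gigl.src.common.utils.bq import BqUtils

# Plain formatting of a BQ path.
path = BqUtils.format_bq_path("my-project:my_dataset.my_table")

# With format_for_table_reference=True, the result is assumed to take the
# "project:dataset.table" form expected by Beam's BigQuery IO table references.
table_ref = BqUtils.format_bq_path(
    "my-project.my_dataset.my_table",
    format_for_table_reference=True,
)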

get_dataset_name_from_table(bq_path)[source]#
Parameters:

bq_path (str)

Return type:

str

get_table_names_within_date_range(bq_dataset_path, table_match_string, start_date, end_date)[source]#

start_date and end_date are in the format 'YYYYMMDD'. table_match_string is a regex string used to match table names.

Parameters:
  • bq_dataset_path (str)

  • table_match_string (str)

  • start_date (str)

  • end_date (str)

Return type:

List[str]
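
A sketch of listing date-partitioned table names; the dataset path and regex are hypothetical:

# `bq_utils` is the BqUtils instance constructed earlier.
tables = bq_utils.get_table_names_within_date_range(
    bq_dataset_path="my-gcp-project.my_dataset",
    table_match_string=r"events_\d{8}",  # regex matched against table names
    start_date="20240101",
    end_date="20240131",
)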

static join_path(path, *paths)[source]#
Parameters:

path (str)

Return type:

str

list_matching_tables(bq_dataset_path, table_match_string)[source]#
Parameters:
  • bq_dataset_path (str)

  • table_match_string (str)

Return type:

List[str]

load_file_to_bq(source_path, bq_path, job_config, retry=False)[source]#

Uploads a single file to BigQuery.

Parameters:
  • source_path (Uri) – The source file to upload.

  • bq_path (str) – The BigQuery table path to upload to.

  • job_config (bigquery.LoadJobConfig) – The job configuration for the upload.

  • retry (bool, optional) – Whether to retry the upload if it fails. Defaults to False.

Return type:

google.cloud.bigquery.job._AsyncJob

Returns:

The job object for the upload.
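
A sketch of loading a newline-delimited JSON file and waiting for completion; the table path is hypothetical and `source_uri` is assumed to be a gigl Uri pointing at the file to load (construction omitted):

from google.cloud import bigquery

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
# `bq_utils` is the BqUtils instance constructed earlier.
load_job = bq_utils.load_file_to_bq(
    source_path=source_uri,
    bq_path="my-gcp-project.my_dataset.my_table",
    job_config=job_config,
    retry=True,
)
load_job.result()  # block until the load job completes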

load_rows_to_bq(bq_path, schema, rows)[source]#
Parameters:
  • bq_path (str)

  • schema (List[google.cloud.bigquery.SchemaField])

  • rows (Iterable[Tuple])

Return type:

None
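
A sketch of loading in-memory rows, where each tuple is assumed to follow the column order of the schema; names are hypothetical:

from google.cloud import bigquery

schema = [
    bigquery.SchemaField("node_id", "STRING"),
    bigquery.SchemaField("degree", "INTEGER"),
]
rows = [("n1", 3), ("n2", 5)]
# `bq_utils` is the BqUtils instance constructed earlier.
bq_utils.load_rows_to_bq(
    bq_path="my-gcp-project.my_dataset.my_table",
    schema=schema,
    rows=rows,
)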

static parse_bq_table_path(bq_table_path)[source]#

Parses a joined BQ table path into its project, dataset, and table names.

Returns:

  • bq_project_id (str) – Parsed BQ Project ID

  • bq_dataset_id (str) – Parsed Dataset ID

  • bq_table_name (str) – Parsed Table Name

Parameters:

bq_table_path (str) – Joined BQ table path of the format project.dataset.table
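
For example (hypothetical path), assuming the three components are returned as a tuple in the order listed above:

from gigl.src.common.utils.bq import BqUtils

project_id, dataset_id, table_name = BqUtils.parse_bq_table_path(
    "my-gcp-project.my_dataset.my_table"
)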

run_query(query, labels, **job_config_args)[source]#
Parameters:

labels (Dict[str, str])

Return type:

google.cloud.bigquery.table.RowIterator
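
A sketch of running a query with job labels and iterating the returned RowIterator; the query, table, and labels are hypothetical:

# `bq_utils` is the BqUtils instance constructed earlier.
rows = bq_utils.run_query(
    query="SELECT node_id, degree FROM `my-gcp-project.my_dataset.my_table` LIMIT 10",
    labels={"pipeline": "example"},
)
for row in rows:
    print(row["node_id"], row["degree"])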

update_bq_dataset_retention(bq_dataset_path, retention_in_days, apply_retroactively=False)[source]#

Update default retention for a whole BQ dataset. This applies only to new tables unless apply_retroactively=True.

Parameters:
  • bq_dataset_path (str) – The BigQuery dataset path in the format project_id.dataset_id.

  • retention_in_days (int) – The number of days to retain data in BigQuery tables.

  • apply_retroactively (Optional[bool]) – If True, applies this retention policy retroactively to all existing tables in the dataset.

Return type:

None
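
A sketch of setting a 30-day default retention on a dataset and applying it retroactively to existing tables; the dataset path is hypothetical:

# `bq_utils` is the BqUtils instance constructed earlier.
bq_utils.update_bq_dataset_retention(
    bq_dataset_path="my-gcp-project.my_dataset",
    retention_in_days=30,
    apply_retroactively=True,
)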

update_bq_table_retention(bq_table_path, retention_in_days)[source]#

Update retention of a single BQ table.

Parameters:
  • bq_table_path (str)

  • retention_in_days (int)

Return type:

None

gigl.src.common.utils.bq.logger[source]#