gigl.src.common.utils.bq#
Module Contents#
- class gigl.src.common.utils.bq.BqUtils(project=None)[source]#
- Parameters:
- project (Optional[str]) 
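A minimal construction sketch; "my-gcp-project" is a placeholder, and the fallback behavior for project=None is an assumption about the underlying BigQuery client rather than something stated here:
```python
from gigl.src.common.utils.bq import BqUtils

# "my-gcp-project" is a placeholder; with project=None the underlying
# BigQuery client is assumed to fall back to the default project
# configured in the environment.
bq_utils = BqUtils(project="my-gcp-project")
```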
 - check_columns_exist_in_table(bq_table, columns)[source]#
- Parameters:
- bq_table (str) 
- columns (Iterable[str]) 
 
- Return type:
- None 
 
 - count_number_of_rows_in_bq_table(bq_table, labels={})[source]#
- Parameters:
- bq_table (str) 
- labels (dict[str, str]) 
 
- Return type:
- int 
 
 - create_or_empty_bq_table(bq_path, schema=None)[source]#
- Parameters:
- bq_path (str) 
- schema (Optional[list[google.cloud.bigquery.SchemaField]]) 
 
- Return type:
- None 
 
 - delete_bq_table_if_exist(bq_table_path, not_found_ok=True)[source]#
- Deletes the given BigQuery table if it exists, e.g. bq_table_path = 'your-project.your_dataset.your_table'.
- Parameters:
- bq_table_path (str) 
- not_found_ok (bool) 
 
- Return type:
- None 
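A short usage sketch with placeholder names; with not_found_ok=True a missing table is not treated as an error:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

# Delete the table, silently succeeding if it does not exist.
bq_utils.delete_bq_table_if_exist(
    bq_table_path="my-gcp-project.my_dataset.my_table",
    not_found_ok=True,
)
```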
 
 - delete_matching_tables(bq_dataset_path, table_match_string)[source]#
- Parameters:
- bq_dataset_path (str) 
- table_match_string (str) 
 
- Return type:
- None 
 
 - export_to_gcs(bq_table_path, destination_gcs_uri, destination_format='NEWLINE_DELIMITED_JSON')[source]#
- Export a BigQuery table to Google Cloud Storage.
- Parameters:
- bq_table_path (str) – The full BigQuery table path to export. 
- destination_gcs_uri (str) – The destination GCS URI where the table will be exported. If the URI contains a * wildcard, the table is exported to multiple shards. 
- destination_format (str, optional) – The format of the exported data. Defaults to 'NEWLINE_DELIMITED_JSON'; 'CSV', 'AVRO', and 'PARQUET' are also available. 
 
- Return type:
- None 
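A sketch of a sharded export; all names are placeholders. Per the parameter description above, a * wildcard in the destination URI requests multi-shard output:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

# The "*" makes BigQuery write numbered shards under the given prefix.
bq_utils.export_to_gcs(
    bq_table_path="my-gcp-project.my_dataset.my_table",
    destination_gcs_uri="gs://my-bucket/exports/my_table-*.json",
    destination_format="NEWLINE_DELIMITED_JSON",
)
```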
 
 - fetch_bq_table_schema(bq_table)[source]#
- Create a dictionary representation of the SchemaFields of a BigQuery table.
- Parameters:
- bq_table (str) 
- Return type:
- dict[str, google.cloud.bigquery.SchemaField] 
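A sketch of inspecting the returned schema; that the dictionary is keyed by column name is an assumption consistent with the return type above:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

schema = bq_utils.fetch_bq_table_schema(
    bq_table="my-gcp-project.my_dataset.my_table"
)
for column_name, field in schema.items():
    # Each value is a google.cloud.bigquery.SchemaField.
    print(column_name, field.field_type, field.mode)
```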
 
 - static format_bq_path(bq_path, format_for_table_reference=False)[source]#
- Formats BQ paths.
- Parameters:
- bq_path (str) – expected to be one of: "<project>.<dataset>.<table>", "<project>:<dataset>.<table>", "<project>.<dataset>", "<project>:<dataset>", or "<dataset>.<table>" 
- format_for_table_reference (bool, optional) – If project, dataset, and table are all specified, add the ":" separator between project and dataset. Useful when a "table_reference" is required instead of a path, i.e. when using the BigQuery IO operator for Beam pipelines. Defaults to False. 
 
- Returns:
- Formatted bq path 
- Return type:
- str 
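Illustrative calls based on the formats listed above; the expected outputs follow the parameter description rather than a verified run:
```python
from gigl.src.common.utils.bq import BqUtils

# Normalize a colon-separated path to the dotted form.
path = BqUtils.format_bq_path("my-project:my_dataset.my_table")
# Expected: "my-project.my_dataset.my_table"

# Request the table-reference form, with ":" between project and
# dataset, as used by e.g. Beam's BigQuery IO operator.
ref = BqUtils.format_bq_path(
    "my-project.my_dataset.my_table",
    format_for_table_reference=True,
)
# Expected: "my-project:my_dataset.my_table"
```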
 
 - get_table_names_within_date_range(bq_dataset_path, table_match_string, start_date, end_date)[source]#
- start_date and end_date are in the format 'YYYYMMDD'; table_match_string is a regex string used to match table names.
- Parameters:
- bq_dataset_path (str) 
- table_match_string (str) 
- start_date (str) 
- end_date (str) 
 
- Return type:
- list[str] 
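A sketch with placeholder names; the regex and the 'YYYYMMDD' bounds follow the description above:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

# Match date-suffixed tables, e.g. "events_20240101" .. "events_20240131".
tables = bq_utils.get_table_names_within_date_range(
    bq_dataset_path="my-gcp-project.my_dataset",
    table_match_string=r"events_\d{8}",  # regex, per the description above
    start_date="20240101",
    end_date="20240131",
)
```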
 
 - list_matching_tables(bq_dataset_path, table_match_string)[source]#
- Parameters:
- bq_dataset_path (str) 
- table_match_string (str) 
 
- Return type:
- list[str] 
 
 - load_file_to_bq(source_path, bq_path, job_config, retry=False)[source]#
- Uploads a single file to BigQuery.
- Parameters:
- source_path (Uri) – The source file to upload. 
- bq_path (str) – The BigQuery table path to upload to. 
- job_config (bigquery.LoadJobConfig) – The job configuration for the upload. 
- retry (bool, optional) – Whether to retry the upload if it fails. Defaults to False. 
 
- Returns:
- The job object for the upload. 
- Return type:
- google.cloud.bigquery.job._AsyncJob 
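A sketch of a CSV load; LoadJobConfig and SourceFormat come from the official BigQuery client library, while construction of the gigl Uri argument is left elided since its import path is not documented here:
```python
from google.cloud import bigquery

from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
)

# `source_file_uri` must be a gigl `Uri` pointing at the file to load;
# how it is constructed is not shown in this module's docs.
job = bq_utils.load_file_to_bq(
    source_path=source_file_uri,
    bq_path="my-gcp-project.my_dataset.my_table",
    job_config=job_config,
    retry=True,
)
job.result()  # block until the load job finishes
```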
 - load_rows_to_bq(bq_path, schema, rows)[source]#
- Parameters:
- bq_path (str) 
- schema (list[google.cloud.bigquery.SchemaField]) 
- rows (Iterable[Tuple]) 
 
- Return type:
- None 
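A sketch of loading in-memory rows; SchemaField is the real google.cloud.bigquery constructor, and each row is a tuple ordered to match the schema:
```python
from google.cloud import bigquery

from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

schema = [
    bigquery.SchemaField("node_id", "INTEGER"),
    bigquery.SchemaField("label", "STRING"),
]
rows = [(1, "a"), (2, "b")]  # tuples ordered to match the schema

bq_utils.load_rows_to_bq(
    bq_path="my-gcp-project.my_dataset.my_table",
    schema=schema,
    rows=rows,
)
```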
 
 - static parse_bq_table_path(bq_table_path)[source]#
- Parses a joined BQ table path into its project, dataset, and table names.
- Parameters:
- bq_table_path (str) – Joined BQ table path of format project.dataset.table 
- Returns:
- bq_project_id (str) – Parsed BQ Project ID 
- bq_dataset_id (str) – Parsed Dataset ID 
- bq_table_name (str) – Parsed Table Name 
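A hedged sketch; that the three names come back as a single unpackable tuple is an assumption based on the Returns section above:
```python
from gigl.src.common.utils.bq import BqUtils

# Static method; tuple unpacking is an assumption based on the three
# documented return values.
bq_project_id, bq_dataset_id, bq_table_name = BqUtils.parse_bq_table_path(
    bq_table_path="my-project.my_dataset.my_table"
)
# bq_project_id == "my-project", and so on.
```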
 
 - run_query(query, labels, **job_config_args)[source]#
- Parameters:
- query (str) 
- labels (dict[str, str]) 
- Return type:
- google.cloud.bigquery.table.RowIterator 
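A sketch of a labeled query; the labels are BigQuery job labels, and extra keyword arguments are forwarded via **job_config_args per the signature above:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

rows = bq_utils.run_query(
    query="SELECT COUNT(*) AS n FROM `my-gcp-project.my_dataset.my_table`",
    labels={"team": "gigl", "purpose": "example"},  # placeholder labels
)
for row in rows:  # google.cloud.bigquery.table.RowIterator
    print(row["n"])
```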
 
 - update_bq_dataset_retention(bq_dataset_path, retention_in_days, apply_retroactively=False)[source]#
- Update default retention for a whole BQ dataset. This applies only to new tables unless apply_retroactively=True.
- Parameters:
- bq_dataset_path (str) – The BigQuery dataset path in the format project_id.dataset_id. 
- retention_in_days (int) – The number of days to retain data in BigQuery tables. 
- apply_retroactively (Optional[bool]) – If True, applies this retention policy retroactively to all existing tables in the dataset. 
 
- Return type:
- None 
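A sketch with placeholder names; without apply_retroactively=True only newly created tables pick up the new default, per the description above:
```python
from gigl.src.common.utils.bq import BqUtils

bq_utils = BqUtils(project="my-gcp-project")  # placeholder project

# Set a 30-day default retention on the dataset and also apply it
# to the tables that already exist in it.
bq_utils.update_bq_dataset_retention(
    bq_dataset_path="my-gcp-project.my_dataset",
    retention_in_days=30,
    apply_retroactively=True,
)
```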
 
 
