diff --git a/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_summary_report.csv b/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_summary_report.csv new file mode 100644 index 0000000000..1c40614669 --- /dev/null +++ b/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_summary_report.csv @@ -0,0 +1,6 @@ +StatVar,NumPlaces,MinDate,MeasurementMethods,Units +InterestRate_TreasuryNote_3Year,1,1962-01-02,[ConstantMaturityRate],[Percent] +InterestRate_TreasuryBond_20Year,1,1962-01-02,[ConstantMaturityRate],[Percent] +InterestRate_TreasuryNote_5Year,1,1962-01-02,[ConstantMaturityRate],[Percent] +InterestRate_TreasuryNote_10Year,1,1962-01-02,[ConstantMaturityRate],[Percent] +InterestRate_TreasuryBill_1Year,1,1962-01-02,[ConstantMaturityRate],[Percent] diff --git a/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_treasury_constant_maturity_rates.csv b/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_treasury_constant_maturity_rates.csv new file mode 100644 index 0000000000..92ebb1a070 --- /dev/null +++ b/scripts/us_fed/treasury_constant_maturity_rates/golden_data/golden_treasury_constant_maturity_rates.csv @@ -0,0 +1,4 @@ +date,1-Month,3-Month,6-Month,1-Year,2-Year,3-Year,5-Year,7-Year,10-Year,20-Year,30-Year +1962-01-02,,,,3.22,,3.70,3.88,,4.06,4.07, +1962-02-01,,,,3.30,,3.81,4.00,,4.09,4.13, +1962-04-19,,,,3.00,,3.37,3.60,,3.82,3.91, diff --git a/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json b/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json index b92d23c5bd..0a4b242b16 100644 --- a/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json +++ b/scripts/us_fed/treasury_constant_maturity_rates/validation_config.json @@ -31,6 +31,27 @@ "params": { "threshold": 0 } + }, + { + "rule_id": "check_goldens_output_csv", + "validator": "GOLDENS", + "scope": { + "data_source": "stats" + }, + "params": { + "golden_files": 
"golden_data/golden_treasury_constant_maturity_rates.csv", + "input_files": "treasury_constant_maturity_rates.csv" + } + }, + { + "rule_id": "check_goldens_summary_report", + "validator": "GOLDENS", + "scope": { + "data_source": "stats" + }, + "params": { + "golden_files": "golden_data/golden_summary_report.csv" + } } ] -} \ No newline at end of file +} diff --git a/tools/import_validation/README.md b/tools/import_validation/README.md index 54406457df..fa89b78adf 100644 --- a/tools/import_validation/README.md +++ b/tools/import_validation/README.md @@ -158,10 +158,41 @@ The following validations are currently supported: | `NUM_OBSERVATIONS_CHECK` | Checks that the number of observations is within a defined range. | `stats` | `minimum`, `maximum`, or `value` (integer) | | `UNIT_CONSISTENCY_CHECK` | Checks that the unit is the same for all StatVars. | `stats` | None | | `MIN_VALUE_CHECK` | Checks that the minimum value is not below a defined minimum. | `stats` | `minimum` (integer or float) | -| `MAX_VALUE_CHECK` | Checks that the maximum value is not above a defined maximum. | `stats` | `maximum` (integer or float) | +| MAX_VALUE_CHECK | Checks that the maximum value is not above a defined maximum. | `stats` | `maximum` (integer or float) | +| `GOLDENS` | Verifies that the data contains all records defined in a golden set. | `stats` | `golden_files` (list), `input_files` (list) | + +### Golden Set Validation with `GOLDENS` + +The `GOLDENS` validator ensures that your import contains a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are always present in your output. + +The validator compares the input data (usually from the `stats` data source) against one or more "golden" files (MCF or CSV). + +#### Configuration Parameters +- `golden_files`: A list or glob pattern of golden MCF or CSV files to compare against. +- `goldens_key_property`: A list of properties to match on. 
If not specified, all properties in the golden record must match. +- `input_files`: (Optional) Path to specific input files. If not provided, the data source defined in the rule's `scope` is used. + +#### GOLDENS Validator Example + +**Rule:** "Ensure that observations for `Count_Person` and `Median_Age_Person` are present in the import as defined in our critical golden set." + +```json +{ + "rule_id": "verify_critical_obs", + "validator": "GOLDENS", + "scope": { + "data_source": "stats" + }, + "params": { + "golden_files": ["goldens/critical_stats.csv"], + "input_files": "processed_obs.csv" + } +} +``` ## Output + The framework generates a report file (specified by the `--validation_output` flag) with the results of each validation. The format of the report is determined by the file extension (`.csv` or `.json`). ### CSV Output diff --git a/tools/import_validation/runner.py b/tools/import_validation/runner.py index a3bbbde132..809269aaee 100644 --- a/tools/import_validation/runner.py +++ b/tools/import_validation/runner.py @@ -77,6 +77,7 @@ def __init__(self, validation_config_path: str, differ_output: str, (self.validator.validate_min_value_check, 'stats'), 'MAX_VALUE_CHECK': (self.validator.validate_max_value_check, 'stats'), + 'GOLDENS': (self.validator.validate_goldens, 'stats'), } self._initialize_data_sources(stats_summary, lint_report, differ_output) @@ -166,10 +167,20 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: validation_func, data_source_key = self.validation_dispatch[ validator_name] + rule_params = dict(rule.get('params', {})) + if rule_params: + # Add default parameters for output folder + output_dir = self.validation_output + if output_dir and not output_dir.endswith( + '/') and not os.path.isdir(output_dir): + output_dir = os.path.dirname(output_dir) + if output_dir: + rule_params.setdefault('output_path', output_dir) + if validator_name == 'SQL_VALIDATOR': result = validation_func(self.data_sources['stats'], 
self.data_sources['differ'], - rule['params']) + rule_params) else: scope = rule['scope'] if isinstance(scope, str): @@ -185,7 +196,7 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]: regex_patterns=variables_config.get('regex'), contains_all=variables_config.get('contains_all')) - result = validation_func(df, rule['params']) + result = validation_func(df, rule_params) result.name = rule['rule_id'] result.validation_params = rule.get('params', {}) diff --git a/tools/import_validation/validator.py b/tools/import_validation/validator.py index b8eec807ce..a4fd19dabd 100644 --- a/tools/import_validation/validator.py +++ b/tools/import_validation/validator.py @@ -20,8 +20,12 @@ _SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(_SCRIPT_DIR) +_DATA_DIR = os.path.join(_SCRIPT_DIR.split('/data/')[0], 'data') +sys.path.append(os.path.join(_DATA_DIR, 'util')) from result import ValidationResult, ValidationStatus +from counters import Counters +import validator_goldens class Validator: @@ -847,3 +851,65 @@ def validate_max_value_check(self, stats_df: pd.DataFrame, 'rows_succeeded': rows_succeeded, 'rows_failed': rows_failed }) + + def validate_goldens(self, df: pd.DataFrame, + params: dict) -> ValidationResult: + """Validates records against a golden set. + + Args: + df: A DataFrame containing the data to validate (used if input_files + is not provided in params). + params: A dictionary containing: + 'golden_files': Path(s) to golden MCF/CSV files. + 'input_files': (Optional) Path(s) to input files. If not provided, + the 'df' will be used. + 'output_path': (Optional) folder or output filename to save missing goldens. + And other optional validator_goldens config (e.g., goldens_key_property). + + Returns: + A ValidationResult object. 
+ """ + golden_files = params.get('golden_files') + if not golden_files: + return ValidationResult( + ValidationStatus.CONFIG_ERROR, + 'GOLDENS', + message= + "Configuration error: 'golden_files' must be specified for GOLDENS validator." + ) + + try: + inputs = params.get('input_files') + if not inputs: + inputs = df.to_dict('index') + output_path = params.get('output_path') + # Compare nodes + counters = Counters() + missing_goldens = validator_goldens.validate_goldens( + inputs, + golden_files, + output_path, + config=params, + counters=counters) + details = { + name: value + for name, value in counters.get_counters().items() + if 'golden' in name + } + if not missing_goldens: + return ValidationResult(ValidationStatus.PASSED, + 'GOLDENS', + details=details) + details['missing_goldens'] = missing_goldens + + return ValidationResult( + ValidationStatus.FAILED, + 'GOLDENS', + message=f"Found {len(missing_goldens)} missing golden records.", + details=details) + + except Exception as e: + return ValidationResult( + ValidationStatus.DATA_ERROR, + 'GOLDENS', + message=f"Error during golden validation: {e}") diff --git a/tools/import_validation/validator_goldens.py b/tools/import_validation/validator_goldens.py new file mode 100644 index 0000000000..72bfcaca7a --- /dev/null +++ b/tools/import_validation/validator_goldens.py @@ -0,0 +1,528 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions to validate data with golden sets. 
+ +This module provides tools to compare sets of nodes (e.g., from CSV or MCF files) +against a "golden" set of expected nodes. It supports flexible matching based on +configurable property sets and handles normalization of values (like stripping +namespaces from DCIDs). + +Example Use Case: Validating StatVarObservations +------------------------------------------------ +You can use this to ensure that your import contains expected observations. + +1. Validate based on variableMeasured and observationAbout: + Config: {'goldens_key_property': ['variableMeasured', 'observationAbout']} + This will check that for every golden observation, an input observation exists + with the same StatVar and Place, regardless of the value or time. + +2. Validate based on a combination of metadata: + Config: { + 'goldens_key_property': [ + 'variableMeasured', 'unit', 'scalingFactor', 'measurementMethod' + ] + } + This ensures that the specific measurement metadata combinations defined in + your goldens are present in the input nodes. 
+ +Usage: + python3 validator_goldens.py \ + --validate_goldens_input=output/observations.csv \ + --validate_goldens=goldens/expected_obs.mcf \ + --goldens_key_property=variableMeasured,observationAbout + + # To generate goldens from input: + python3 validator_goldens.py \ + --validate_goldens_input=output/observations.csv \ + --generate_goldens_property_sets="variableMeasured|observationAbout,observationDate,variableMeasured|unit|scalingFactor|observationPeriod|measurementMethod" \ + --generate_goldens=goldens_data/generated_goldens.csv + + # To generate goldens using a sample of input nodes: + python3 validator_goldens.py \ + --validate_goldens_input=output/observations.csv \ + --goldens_sample_rows=100 \ + --generate_goldens_property_sets="variableMeasured|observationAbout" \ + --generate_goldens=goldens_data/generated_goldens.csv + + # To generate goldens capturing every unique value in every column: + python3 validator_goldens.py \ + --validate_goldens_input=output/observations.mcf \ + --goldens_sampler_exhaustive \ + --generate_goldens=goldens_data/generated_goldens.mcf + + # To generate goldens ensuring prominent DCIDs are included if present: + python3 validator_goldens.py \ + --validate_goldens_input=output/observations.csv \ + --goldens_must_include="variableMeasured:selected_svs.txt,observationAbout:selected_places.txt" \ + --generate_goldens=goldens_data/generated_goldens.csv +""" + +import os +import sys +import tempfile + +from absl import app +from absl import flags +from absl import logging + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_DATA_DIR = os.path.join(_SCRIPT_DIR.split('/data/')[0], 'data') +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(_DATA_DIR) +sys.path.append(os.path.join(_DATA_DIR, 'util')) +sys.path.append(os.path.join(_DATA_DIR, 'tools', 'statvar_importer')) + +import file_util +import mcf_file_util +import data_sampler + +from counters import Counters +from mcf_diff 
import fingerprint_node + +flags.DEFINE_list('validate_goldens_input', None, + 'List of files to be compared against goldens.') +flags.DEFINE_list('validate_goldens', None, + 'List of golden files to be compared against') +flags.DEFINE_string('generate_goldens', None, + 'Golden file to be generated from the input.') +flags.DEFINE_string('validate_goldens_output', None, + 'Output file with missing goldens') +flags.DEFINE_list('goldens_key_property', [], + 'Properties in golden nodes to be compared.') +flags.DEFINE_list('goldens_ignore_property', ['value'], + 'Properties in golden nodes to be ignored.') +flags.DEFINE_list( + 'generate_goldens_property_sets', [], + 'List of property sets to generate goldens for. ' + 'Each set is a pipe (|) separated list of properties. ' + 'Example: "variableMeasured|observationAbout,observationDate"') +flags.DEFINE_integer( + 'goldens_sample_rows', 0, + 'Number of input rows to sample for generating goldens. ' + 'If 0, all rows are used.') +flags.DEFINE_boolean( + 'goldens_sampler_exhaustive', False, + 'If True, uses exhaustive sampling to capture every ' + 'unique value in the input nodes.') +flags.DEFINE_list( + 'goldens_must_include', [], + 'List of "column:file" pairs containing values (e.g. prominent DCIDs) ' + 'that MUST be included in the generated goldens if they appear ' + 'in the input data. ' + 'Example: "variableMeasured:website/tools/nl/embeddings/input/base/sheets_svs.csv,observationAbout:places.txt"' +) +flags.DEFINE_string('goldens_ignore_comments', '#', + 'Prefix for comments to be ignored in the golden set.') + +_FLAGS = flags.FLAGS + + +def get_validator_goldens_config() -> dict: + """Returns a dictionary of config parameters for MCF diff from flags. + + The config includes properties to ignore and properties to use as keys + for matching nodes, derived from command-line flags. 
+ """ + if not _FLAGS.is_parsed(): + _FLAGS.mark_as_parsed() + return { + 'goldens_ignore_property': _FLAGS.goldens_ignore_property, + 'goldens_key_property': _FLAGS.goldens_key_property, + 'goldens_must_include': _FLAGS.goldens_must_include, + 'goldens_ignore_comments': _FLAGS.goldens_ignore_comments, + + # config options for data_sampler when generating goldens + 'sampler_output_rows': _FLAGS.goldens_sample_rows, + 'sampler_exhaustive': _FLAGS.goldens_sampler_exhaustive, + 'sampler_column_keys': _FLAGS.goldens_must_include, + } + + +def _is_commented_node(fingerprint: str, comment_char: str = '#') -> bool: + """Returns True if the node fingerprint is commented. + + Args: + fingerprint: string fingerprint of the node of the form 'prop=value;...' + + Returns: + True if any property or value is commented. + """ + if not comment_char: + return False + if fingerprint.startswith( + comment_char + ) or f';{comment_char}' in fingerprint or f'={comment_char}' in fingerprint: + return True + return False + + +# Compare nodes in a dictionary to nodes in a golden set +def validator_compare_nodes(input_nodes: dict, + golden_nodes: dict, + config: dict = None, + counters: Counters = None) -> list: + """Returns a summary of the differences in the input and golden nodes. + + It only compares the properties defined in the golden nodes against the + corresponding properties in the input_nodes. + + Args: + input_nodes: dictionary of nodes which are dictionary of property:values. + { : { : ,,,}, : { : ..} + golden_nodes: dictionary of key to expected nodes with property:values. + These nodes may have fewer properties than input_nodes. + config: dictionary of config parameters such as ignore lists and + normalization settings. + counters: Output counters for tracking match statistics. + + Returns: + A list of fingerprints for golden nodes that were not matched in the input. 
+ """ + + if counters is None: + counters = Counters() + + if config is None: + config = get_validator_goldens_config() + + # Extract configuration parameters with defaults. + ignore_props = config.get('goldens_ignore_property', {}) + comment_char = config.get('goldens_ignore_comments', '#') + golden_key_props = set(config.get('goldens_key_property', {})) + key_delimiter = config.get('golden_key_delimiter', '|') + + # Step 1: Group golden nodes by their set of properties. + # Goldens may have a subset of the input node properties and would match + # any input node that contains all the golden property:values. + # Different golden nodes might specify different subsets of properties to match on. + golden_key_sets = {} + golden_matches = dict() + logging.debug(f'Extracting properties for {len(golden_nodes)} goldens') + for node_key, node in golden_nodes.items(): + node_props = set() + for prop in node.keys(): + if not prop: + continue + if comment_char and prop.startswith(comment_char): + continue + if prop in ignore_props: + continue + if golden_key_props and prop not in golden_key_props: + continue + node_props.add(prop) + + if not node_props: + counters.add_counter('validate-goldens-commented', 1) + continue + # Use the joined sorted property names as a key for the group. + node_props_key = key_delimiter.join(sorted(list(node_props))) + golden_key_sets[node_props_key] = node_props + + # Initialize match count for the golden node to 0. 
+ key = fingerprint_node(node, compare_props=node_props) + golden_matches.setdefault(key, {'node': node, 'matches': 0}) + + logging.info( + f'Comparing {len(input_nodes)} nodes against {len(golden_matches)} goldens in {len(golden_key_sets)} sets using properties: {golden_key_sets.keys()}' + ) + counters.add_counter('validate-goldens-sets', len(golden_key_sets)) + counters.add_counter('validate-goldens-inputs', len(input_nodes)) + counters.add_counter('validate-goldens-expected', len(golden_matches)) + + # Step 2: Match each input node with the golden fingerprints. + # An input node may match more than one golden node with different + # set of property:values. + for node in input_nodes.values(): + # An input node might match different golden "shapes" (sets of properties). + for node_key_props in golden_key_sets.values(): + key = fingerprint_node(node, + compare_props=node_key_props, + ignore_props=ignore_props) + if key in golden_matches: + golden_matches[key]['matches'] += 1 + counters.add_counter('validate-goldens-input-matched', 1) + + # Step 3: Identify which golden fingerprints had no corresponding input nodes. + missing_goldens = [] + for key, node_counts in golden_matches.items(): + count = node_counts.get('matches', 0) + if count > 0: + # This key got matches. + counters.add_counter('validate-goldens-matched', 1) + else: + if _is_commented_node(key, comment_char): + # No matches for this key. Ignore commented keys. + counters.add_counter('validate-goldens-ignored', 1) + else: + missing_goldens.append(node_counts.get('node')) + counters.add_counter('validate-goldens-missing', 1) + + if missing_goldens: + logging.error( + f'Missing {len(missing_goldens)} among {len(golden_nodes)} goldens in {len(input_nodes)} input nodes.' 
+ ) + logging.debug(f'Missing goldens: {missing_goldens}') + else: + logging.info( + f'Goldens match successful: {len(golden_nodes)} goldens matched {len(input_nodes)} inputs' + ) + + return missing_goldens + + +def load_nodes_from_file(files: str) -> dict: + """Returns a dictionary of nodes loaded from the files. + + Supports CSV and MCF formats. + - CSV files: Each row is loaded as a node. + - MCF files: Each node is loaded based on its DCID. + """ + nodes = {} + input_files = file_util.file_get_matching(files) + for input_file in input_files: + if file_util.file_is_csv(input_file): + # For CSV, we treat each row as a dictionary of column:value. + # Nodes are keyed by their index in the combined loaded set. + file_nodes = file_util.file_load_csv_dict(input_file, + key_index=True) + for node in file_nodes.values(): + nodes[len(nodes)] = node + else: + # For MCF or JSON, we assume nodes are already keyed by DCID. + file_nodes = mcf_file_util.load_mcf_nodes(input_file) + for dcid, node in file_nodes.items(): + # Ensure the dcid is present in the node dictionary itself. + if 'dcid' not in node: + node['dcid'] = mcf_file_util.strip_namespace(dcid) + mcf_file_util.add_mcf_node(node, nodes) + + logging.info(f'Loaded {len(nodes)} nodes from {input_files}') + return nodes + + +def generate_goldens(input_files: str, + property_sets: list, + output_file: str = None, + config: dict = None, + counters: Counters = None) -> dict: + """Generates a set of unique golden nodes from input files. + + For each input node and each property set in property_sets, it extracts + the values for those properties and creates a unique golden node. + If sampling is requested, a representative sample of input nodes is used + as the basis for generating the golden nodes. + + Args: + input_files: Glob pattern or list of input data files. + property_sets: List of sets/lists of properties to extract. 
+ Example: [{'variableMeasured'}, {'observationAbout', 'variableMeasured'}] + output_file: Path to write the generated goldens to (MCF format). + config: Configuration for normalization and sampling. + counters: Output counters. + + Returns: + A dictionary of unique golden nodes keyed by their fingerprints. + """ + if counters is None: + counters = Counters() + + if config is None: + config = get_validator_goldens_config() + + # Apply sampling if requested. + sampler_rows = config.get('sampler_output_rows', 0) + exhaustive = config.get('sampler_exhaustive', False) + must_include_values = data_sampler.load_column_keys( + config.get('sampler_column_keys', [])) + if must_include_values: + for col, vals in must_include_values.items(): + counters.add_counter(f'generate-goldens-include-{col}', len(vals)) + if sampler_rows > 0 or exhaustive: + logging.info( + f'Sampling rows from {input_files} (exhaustive={exhaustive}, rows={sampler_rows})' + ) + if exhaustive: + config['sampler_column_regex'] = '.*' + + # Generate a representative sample with unique values across columns. + with tempfile.NamedTemporaryFile(mode='w+t', suffix='.csv', + delete=True) as sampled_file: + sampler = data_sampler.DataSampler(config_dict=config, + counters=counters) + sampler.sample_csv_file(input_files, output_file=sampled_file.name) + input_nodes = load_nodes_from_file(sampled_file.name) + logging.info( + f'Using sampled file: {sampled_file} with {len(input_nodes)} nodes' + ) + counters.add_counter(f'generate-goldens-sampled-nodes', + len(input_nodes)) + else: + input_nodes = load_nodes_from_file(input_files) + counters.add_counter('generate-goldens-input-nodes', len(input_nodes)) + + # If not sampling, but must_include_values are provided, use them as a filter + # to focus goldens on prominent DCIDs if requested. 
+ if must_include_values: + filtered_nodes = {} + for k, node in input_nodes.items(): + match = False + for col, vals in must_include_values.items(): + if node.get(col) in vals: + match = True + break + if match: + filtered_nodes[k] = node + + logging.info( + f'Filtered {len(input_nodes)} nodes down to {len(filtered_nodes)} matching prominent DCIDs.' + ) + input_nodes = filtered_nodes + counters.add_counter('generate-goldens-filtered-nodes', + len(filtered_nodes)) + + ignore_props = set(config.get('goldens_ignore_property', [])) + + golden_nodes = {} + for node in input_nodes.values(): + # If no property sets are provided, use all properties in the current node + # except those that are explicitly ignored. + effective_property_sets = property_sets + if not effective_property_sets: + node_props = set(node.keys()) - ignore_props + if node_props: + effective_property_sets = [node_props] + else: + continue + + for props in effective_property_sets: + # Create a dictionary for this specific property set from the input node. + golden_node = {} + has_all_props = True + for prop in props: + + if prop in node: + golden_node[prop] = node[prop] + else: + # If a node is missing one of the properties in a set, + # we skip this combination. + has_all_props = False + break + + if not has_all_props or not golden_node: + continue + + # Generate a unique key for this golden node shape. + key = fingerprint_node(golden_node, compare_props=props) + + if key not in golden_nodes: + golden_nodes[key] = golden_node + counters.add_counter('generate-goldens-unique', 1) + + counters.add_counter('generate-goldens-processed', 1) + + logging.info( + f'Generated {len(golden_nodes)} unique goldens from {len(input_nodes)} input nodes.' 
+ ) + counters.add_counter('generated-golden-output', len(golden_nodes)) + + if golden_nodes and output_file: + logging.info(f'Writing {len(golden_nodes)} goldens to {output_file}') + if file_util.file_is_csv(output_file): + file_util.file_write_csv_dict(golden_nodes, + output_file, + key_column_name=None) + else: + mcf_file_util.write_mcf_nodes([golden_nodes], output_file) + + return golden_nodes + + +def validate_goldens(inputs: str | dict, + golden_files: str, + output_file: str = None, + config: dict = None, + counters: Counters = None) -> list: + """Validate records in the input files against goldens. + + This is the high-level entry point for comparing two sets of files. + + Args: + inputs: Glob pattern for list of input data files or + dictionary of input nodes. + golden_files: Glob pattern or list of golden data files. + output_file: Path to write missing goldens to. + config: Validation configuration. + counters: Counters for tracking progress and results. + """ + if config is None: + config = get_validator_goldens_config() + + # Load all nodes from input and golden files. + if isinstance(inputs, dict): + input_nodes = inputs + else: + input_nodes = load_nodes_from_file(inputs) + golden_files_list = file_util.file_get_matching(golden_files) + golden_nodes = load_nodes_from_file(golden_files_list) + + # Run the core comparison logic. + missing_goldens = validator_compare_nodes(input_nodes, golden_nodes, config, + counters) + + # Optionally write out the missing golden nodes for debugging. + if missing_goldens and output_file: + if output_file.endswith('/') or os.path.isdir(output_file): + # Append a default filename if only a directory was provided. 
+ output_file = os.path.join( + output_file, + 'goldens_missing_' + os.path.basename(golden_files_list[0])) + logging.info( + f'Writing {len(missing_goldens)} missing goldens to {output_file}') + if file_util.file_is_csv(output_file): + file_util.file_write_csv_dict(dict(enumerate(missing_goldens)), + output_file) + else: + mcf_file_util.write_mcf_nodes(dict(enumerate(missing_goldens)), + output_file) + return missing_goldens + + +def main(_): + """Main entry point for the validator script.""" + logging.set_verbosity(2) + counters = Counters() + + if _FLAGS.generate_goldens: + # Generation Mode + property_sets = [] + for p_set_str in _FLAGS.generate_goldens_property_sets: + property_sets.append(set(p_set_str.split('|'))) + + generate_goldens(_FLAGS.validate_goldens_input, + property_sets, + output_file=_FLAGS.generate_goldens, + config=get_validator_goldens_config(), + counters=counters) + if _FLAGS.validate_goldens: + # Validation Mode + validate_goldens(_FLAGS.validate_goldens_input, + _FLAGS.validate_goldens, + output_file=_FLAGS.validate_goldens_output, + config=get_validator_goldens_config(), + counters=counters) + + +if __name__ == '__main__': + app.run(main) diff --git a/tools/import_validation/validator_goldens_test.py b/tools/import_validation/validator_goldens_test.py new file mode 100644 index 0000000000..5675195e99 --- /dev/null +++ b/tools/import_validation/validator_goldens_test.py @@ -0,0 +1,288 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for validator_goldens.py""" + +import os +import sys +import unittest +from unittest.mock import patch, MagicMock + +# Set up paths as in validator_goldens.py +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_DATA_DIR = os.path.join(_SCRIPT_DIR.split('/data/')[0], 'data') +sys.path.append(_SCRIPT_DIR) +sys.path.append(os.path.dirname(_SCRIPT_DIR)) +sys.path.append(_DATA_DIR) +sys.path.append(os.path.join(_DATA_DIR, 'util')) +sys.path.append(os.path.join(_DATA_DIR, 'tools', 'statvar_importer')) + +import validator_goldens +from counters import Counters + + +class TestValidatorGoldens(unittest.TestCase): + + def test_get_validator_goldens_config(self): + with patch('validator_goldens._FLAGS') as mock_flags: + mock_flags.goldens_ignore_property = ['p1'] + mock_flags.goldens_key_property = ['p2'] + config = validator_goldens.get_validator_goldens_config() + self.assertEqual(config['goldens_ignore_property'], ['p1']) + self.assertEqual(config['goldens_key_property'], ['p2']) + + @patch('validator_goldens.mcf_file_util') + def test_normalize_value(self, mock_mcf): + # Test string with quotes + self.assertEqual(validator_goldens.normalize_value(' "val" '), 'val') + # Test float normalization + self.assertEqual(validator_goldens.normalize_value(1.20), '1.2') + # Test namespace stripping + mock_mcf.strip_namespace.return_value = 'val' + self.assertEqual( + validator_goldens.normalize_value('dcid:val', + strip_namespaces=True), 'val') + mock_mcf.strip_namespace.assert_called_with('dcid:val') + # Test list normalization + mock_mcf.normalize_list.return_value = 'v1,v2' + self.assertEqual(validator_goldens.normalize_value('v1,v2'), 'v1,v2') + mock_mcf.normalize_list.assert_called_with('v1,v2') + # Test alphanumeric string + self.assertEqual(validator_goldens.normalize_value('simple'), 'simple') + + def test_get_node_fingerprint(self): + node = {'p1': 
'v1', 'p2': 'v2', 'p3': 'v3'} + # All properties (default) + self.assertEqual(validator_goldens.get_node_fingerprint(node), + 'p1=v1;p2=v2;p3=v3') + # Specific key properties + self.assertEqual( + validator_goldens.get_node_fingerprint(node, + key_property={'p1', 'p3'}), + 'p1=v1;p3=v3') + # Ignore properties + self.assertEqual( + validator_goldens.get_node_fingerprint(node, + ignore_property={'p2'}), + 'p1=v1;p3=v3') + # Combined key and ignore + self.assertEqual( + validator_goldens.get_node_fingerprint(node, + key_property={'p1', 'p2'}, + ignore_property={'p2'}), + 'p1=v1') + + def test_validator_compare_nodes(self): + input_nodes = { + 'n1': { + 'p1': 'v1', + 'p2': 'v2' + }, + 'n2': { + 'p1': 'v3', + 'p2': 'v4' + } + } + golden_nodes = {'g1': {'p1': 'v1'}, 'g2': {'p1': 'v5'}} + config = {'goldens_key_property': ['p1']} + counters = Counters() + missing = validator_goldens.validator_compare_nodes( + input_nodes, golden_nodes, config, counters) + # Expected fingerprint for g2 is p1=v5, which is not in input_nodes + self.assertEqual(missing, ['p1=v5']) + self.assertEqual(counters.get_counter('validate-goldens-missing'), 1) + self.assertEqual(counters.get_counter('validate-goldens-matched'), 1) + + def test_validator_compare_nodes_multiple_sets(self): + input_nodes = { + 'n1': { + 'p1': 'v1', + 'p2': 'v2' + }, + 'n2': { + 'p1': 'v1', + 'p3': 'v3' + } + } + golden_nodes = { + 'g1': { + 'p1': 'v1', + 'p2': 'v2' + }, + 'g2': { + 'p1': 'v1', + 'p3': 'v3' + } + } + # config empty, so it will group by all props in each golden node + counters = Counters() + missing = validator_goldens.validator_compare_nodes( + input_nodes, golden_nodes, {}, counters) + self.assertEqual(missing, []) + self.assertEqual(counters.get_counter('validate-goldens-matched'), 2) + + @patch('validator_goldens.file_util') + @patch('validator_goldens.mcf_file_util') + def test_load_nodes_from_file(self, mock_mcf, mock_file): + mock_file.file_get_matching.return_value = ['f1.csv', 'f2.mcf'] + 
mock_file.file_is_csv.side_effect = lambda x: x.endswith('.csv') + mock_file.file_load_csv_dict.return_value = {0: {'p1': 'v1'}} + mock_mcf.load_file_nodes.return_value = {'dcid:n1': {'p1': 'v2'}} + mock_mcf.strip_namespace.return_value = 'n1' + + def side_effect_add(pvs, nodes, **kwargs): + nodes[pvs['dcid']] = pvs + return True + + mock_mcf.add_mcf_node.side_effect = side_effect_add + + nodes = validator_goldens.load_nodes_from_file('dummy') + + self.assertEqual(len(nodes), 2) + self.assertIn(0, nodes) + self.assertEqual(nodes[0]['p1'], 'v1') + self.assertIn('n1', nodes) + self.assertEqual(nodes['n1']['p1'], 'v2') + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.mcf_file_util') + def test_generate_goldens(self, mock_mcf, mock_load): + mock_load.return_value = { + 0: { + 'variableMeasured': 'sv1', + 'observationAbout': 'geo1', + 'value': 10 + }, + 1: { + 'variableMeasured': 'sv2', + 'observationAbout': 'geo1', + 'value': 20 + }, + } + property_sets = [{'variableMeasured'}, {'variableMeasured', 'observationAbout'}] + + goldens = validator_goldens.generate_goldens('dummy', property_sets) + + # Unique goldens expected: + # 1. variableMeasured=sv1 + # 2. variableMeasured=sv2 + # 3. observationAbout=geo1;variableMeasured=sv1 + # 4. 
observationAbout=geo1;variableMeasured=sv2 + self.assertEqual(len(goldens), 4) + self.assertIn('variableMeasured=sv1', goldens) + self.assertIn('variableMeasured=sv2', goldens) + self.assertIn('observationAbout=geo1;variableMeasured=sv1', goldens) + self.assertIn('observationAbout=geo1;variableMeasured=sv2', goldens) + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.mcf_file_util') + @patch('validator_goldens.data_sampler') + def test_generate_goldens_with_sampling(self, mock_sampler, mock_mcf, mock_load): + mock_sampler.sample_csv_file.return_value = 'tmp-sample.csv' + mock_load.return_value = {0: {'p1': 'v1'}} + + property_sets = [{'p1'}] + config = {'sampler_output_rows': 10} + + + with patch('os.path.exists', return_value=True), patch('os.remove') as mock_remove: + goldens = validator_goldens.generate_goldens('input.csv', property_sets, config=config) + + self.assertEqual(len(goldens), 1) + mock_sampler.sample_csv_file.assert_called_once() + mock_load.assert_called_with('tmp-sample.csv') + mock_remove.assert_called_with('tmp-sample.csv') + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.mcf_file_util') + def test_generate_goldens_all_props(self, mock_mcf, mock_load): + mock_load.return_value = { + 0: { + 'p1': 'v1', + 'p2': 'v2', + 'ignore_me': 'x' + } + } + # property_sets is empty, should use all props except ignore_me + property_sets = [] + config = {'goldens_ignore_property': ['ignore_me']} + + goldens = validator_goldens.generate_goldens('dummy', property_sets, config=config) + + self.assertEqual(len(goldens), 1) + key = list(goldens.keys())[0] + # p1=v1;p2=v2 (alphabetical) + self.assertEqual(key, 'p1=v1;p2=v2') + self.assertIn('p1', goldens[key]) + self.assertIn('p2', goldens[key]) + self.assertNotIn('ignore_me', goldens[key]) + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.mcf_file_util') + def test_generate_goldens_with_must_include_values(self, mock_mcf, 
mock_load): + # Input has two nodes, but only one matches the prominent DCID filter. + mock_load.return_value = { + 0: {'p1': 'v1', 'p2': 'other'}, + 1: {'p1': 'v2', 'p2': 'other'} + } + # Filter for p1=v1 + must_include_values = {'p1': {'v1'}} + property_sets = [{'p1'}] + + goldens = validator_goldens.generate_goldens( + 'dummy', property_sets, must_include_values=must_include_values) + + # Only v1 should be included because of the filter (non-sampled mode). + self.assertEqual(len(goldens), 1) + self.assertIn('p1=v1', goldens) + self.assertNotIn('p1=v2', goldens) + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.mcf_file_util') + def test_generate_goldens_all_props_mixed_schema(self, mock_mcf, mock_load): + # input nodes have different columns + mock_load.return_value = { + 0: {'p1': 'v1'}, + 1: {'p2': 'v2'} + } + # property_sets is empty, should use each node's own props + property_sets = [] + + goldens = validator_goldens.generate_goldens('dummy', property_sets) + + self.assertEqual(len(goldens), 2) + self.assertIn('p1=v1', goldens) + self.assertIn('p2=v2', goldens) + + @patch('validator_goldens.load_nodes_from_file') + @patch('validator_goldens.validator_compare_nodes') + @patch('validator_goldens.file_util') + def test_validate_goldens(self, mock_file, mock_compare, mock_load): + mock_load.side_effect = [ + {'n1': { + 'p1': 'v1' + }}, # input + {'g1': { + 'p1': 'v1' + }} # golden + ] + mock_compare.return_value = [] + + missing = validator_goldens.validate_goldens('in', 'gold', 'out') + + self.assertEqual(missing, []) + mock_compare.assert_called_once() + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/import_validation/validator_integration_test.py b/tools/import_validation/validator_integration_test.py new file mode 100644 index 0000000000..b4a0bee0a1 --- /dev/null +++ b/tools/import_validation/validator_integration_test.py @@ -0,0 +1,129 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Integration tests for the GOLDENS validation rule in Validator."""

import os
import sys
import pandas as pd
import unittest
import tempfile
import shutil
import csv

# Make the sibling validator/result modules importable when run directly.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, _SCRIPT_DIR)

from validator import Validator
from result import ValidationStatus


class TestGoldensValidation(unittest.TestCase):
    '''Test Class for the GOLDENS validation rule.

    Each test builds small CSV fixtures in a per-test temp directory and
    checks the ValidationStatus returned by Validator.validate_goldens.
    '''

    def setUp(self):
        # Fresh Validator and an isolated temp dir for every test.
        self.validator = Validator()
        self.test_dir = tempfile.mkdtemp()

        # Create a sample golden CSV
        self.golden_file = os.path.join(self.test_dir, 'goldens.csv')
        with open(self.golden_file, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['StatVar', 'NumPlaces'])
            writer.writerow(['sv1', '10'])
            writer.writerow(['sv2', '20'])

        # Create a sample input CSV that matches
        self.input_file_match = os.path.join(self.test_dir, 'input_match.csv')
        with open(self.input_file_match, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['StatVar', 'NumPlaces', 'Value'])
            writer.writerow(['sv1', '10', '100'])
            writer.writerow(['sv2', '20', '200'])

        # Create a sample input CSV that is missing a golden
        self.input_file_missing = os.path.join(self.test_dir, 'input_missing.csv')
        with open(self.input_file_missing, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['StatVar', 'NumPlaces', 'Value'])
            writer.writerow(['sv1', '10', '100'])
            # sv2 is missing

    def tearDown(self):
        # Remove all fixture files created by setUp.
        shutil.rmtree(self.test_dir)

    def test_validate_goldens_passes_with_matching_files(self):
        """All golden records present in the input file -> PASSED."""
        params = {
            'golden_files': self.golden_file,
            'input_files': self.input_file_match,
            'goldens_key_property': ['StatVar', 'NumPlaces']
        }
        # df is not used when input_files is in params
        result = self.validator.validate_goldens(pd.DataFrame(), params)
        self.assertEqual(result.status, ValidationStatus.PASSED)

    def test_validate_goldens_fails_with_missing_records(self):
        """A golden record absent from the input file -> FAILED + details."""
        params = {
            'golden_files': self.golden_file,
            'input_files': self.input_file_missing,
            'goldens_key_property': ['StatVar', 'NumPlaces']
        }
        result = self.validator.validate_goldens(pd.DataFrame(), params)
        self.assertEqual(result.status, ValidationStatus.FAILED)
        self.assertIn('Found 1 missing golden records', result.message)
        # Fingerprint of sv2: 'NumPlaces=20;StatVar=sv2' (alphabetical)
        self.assertIn('StatVar=sv2', result.details['missing_goldens'][0])

    def test_validate_goldens_uses_dataframe_when_input_files_missing(self):
        """Without input_files, validation falls back to the DataFrame."""
        # Sample DataFrame representing the stats data source
        df = pd.DataFrame({
            'StatVar': ['sv1', 'sv2'],
            'NumPlaces': [10, 20],
            'Value': [100, 200]
        })
        params = {
            'golden_files': self.golden_file,
            'goldens_key_property': ['StatVar', 'NumPlaces']
        }
        result = self.validator.validate_goldens(df, params)
        self.assertEqual(result.status, ValidationStatus.PASSED)

    def test_validate_goldens_fails_with_missing_records_from_df(self):
        """DataFrame missing a golden record -> FAILED with one detail."""
        # Sample DataFrame missing sv2
        df = pd.DataFrame({
            'StatVar': ['sv1'],
            'NumPlaces': [10],
            'Value': [100]
        })
        params = {
            'golden_files': self.golden_file,
            'goldens_key_property': ['StatVar', 'NumPlaces']
        }
        result = self.validator.validate_goldens(df, params)
        self.assertEqual(result.status, ValidationStatus.FAILED)
        self.assertEqual(len(result.details['missing_goldens']), 1)

    def test_validate_goldens_missing_golden_files_param(self):
        """Omitting the required golden_files param -> CONFIG_ERROR."""
        params = {'input_files': self.input_file_match}
        result = self.validator.validate_goldens(pd.DataFrame(), params)
        self.assertEqual(result.status, ValidationStatus.CONFIG_ERROR)
        self.assertIn('golden_files', result.message)

    def test_validate_goldens_empty_df_error(self):
        """Empty DataFrame and no input_files -> DATA_ERROR."""
        params = {'golden_files': self.golden_file}
        result = self.validator.validate_goldens(pd.DataFrame(), params)
        self.assertEqual(result.status, ValidationStatus.DATA_ERROR)
        self.assertIn('provided data source is empty', result.message)


if __name__ == '__main__':
    unittest.main()
diff --git a/tools/statvar_importer/mcf_diff.py b/tools/statvar_importer/mcf_diff.py
index 6b7cb79e1a..e7eaa825f8 100644
--- a/tools/statvar_importer/mcf_diff.py
+++ b/tools/statvar_importer/mcf_diff.py
@@ -299,7 +299,7 @@ def fingerprint_node(pvs: dict,
     for p in sorted(normalized_pvs.keys()):
         if p not in ignore_props:
             if not compare_props or p in compare_props:
-                fp.append(f'{p}:{normalized_pvs[p]}')
+                fp.append(f'{p}={normalized_pvs[p]}')
     return ';'.join(fp)
diff --git a/tools/statvar_importer/mcf_file_util.py b/tools/statvar_importer/mcf_file_util.py
index 2a78fde9db..62a6b5da92 100644
--- a/tools/statvar_importer/mcf_file_util.py
+++ b/tools/statvar_importer/mcf_file_util.py
@@ -43,6 +43,7 @@
 from collections import OrderedDict
 import csv
 import glob
+import hashlib
 import os
 import re
 import sys
@@ -1051,6 +1052,7 @@ def write_mcf_nodes(
             node_dict.update(d)
         file_util.file_write_csv_dict(node_dict, filename)
         return
+    filename_base = os.path.basename(filename)
     with file_util.FileIO(filename, mode) as output_f:
         if header is not None:
             output_f.write(header)
@@ -1061,6 +1063,11 @@
         node_keys = sorted(node_keys)
     for dcid in node_keys:
         node = nodes[dcid]
+        if 'dcid' not in node and 'Node' not in node:
+            # Generate a local dcid in a node copy
+            node = dict(node)
+            node['Node'] = f'l:{filename_base}/' + hashlib.md5(
+                str(dcid).encode('utf-8')).hexdigest()
         if sort:
             node = normalize_mcf_node(node, ignore_comments)
         pvs = node_dict_to_text(node, 
default_pvs) diff --git a/tools/statvar_importer/property_value_mapper.py b/tools/statvar_importer/property_value_mapper.py index 76d5209216..8c60f481a3 100644 --- a/tools/statvar_importer/property_value_mapper.py +++ b/tools/statvar_importer/property_value_mapper.py @@ -347,7 +347,7 @@ def _process_eval(self, pvs: dict, data_key: str) -> bool: self._log_every_n) if not eval_prop: eval_prop = data_key - if eval_data and eval_data != eval_str: + if eval_data is not None and eval_data != eval_str: pvs[eval_prop] = eval_data self._counters.add_counter('processed-eval', 1, eval_str) pvs.pop(eval_key) diff --git a/util/file_util.py b/util/file_util.py index e1d7fc1f1c..f375cb76bf 100644 --- a/util/file_util.py +++ b/util/file_util.py @@ -574,7 +574,7 @@ def file_load_csv_dict( if not key_column: # Use the first column as the key key_column = reader.fieldnames[0] - if not value_column and len(reader.fieldnames) == 2: + if not value_column and len(reader.fieldnames) == 2 and not key_index: # Use second column as value if there are only two columns. value_column = reader.fieldnames[1] logging.info(