Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
StatVar,NumPlaces,MinDate,MeasurementMethods,Units
InterestRate_TreasuryNote_3Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
InterestRate_TreasuryBond_20Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
InterestRate_TreasuryNote_5Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
InterestRate_TreasuryNote_10Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
InterestRate_TreasuryBill_1Year,1,1962-01-02,[ConstantMaturityRate],[Percent]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
date,1-Month,3-Month,6-Month,1-Year,2-Year,3-Year,5-Year,7-Year,10-Year,20-Year,30-Year
1962-01-02,,,,3.22,,3.70,3.88,,4.06,4.07,
1962-02-01,,,,3.30,,3.81,4.00,,4.09,4.13,
1962-04-19,,,,3.00,,3.37,3.60,,3.82,3.91,
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,27 @@
"params": {
"threshold": 0
}
},
{
"rule_id": "check_goldens_output_csv",
"validator": "GOLDENS",
"scope": {
"data_source": "stats"
},
"params": {
"golden_files": "golden_data/golden_treasury_constant_maturity_rates.csv",
"input_files": "treasury_constant_maturity_rates.csv"
}
},
{
"rule_id": "check_goldens_summary_report",
"validator": "GOLDENS",
"scope": {
"data_source": "stats"
},
"params": {
"golden_files": "golden_data/golden_summary_report.csv"
}
}
]
}
}
33 changes: 32 additions & 1 deletion tools/import_validation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,41 @@ The following validations are currently supported:
| `NUM_OBSERVATIONS_CHECK` | Checks that the number of observations is within a defined range. | `stats` | `minimum`, `maximum`, or `value` (integer) |
| `UNIT_CONSISTENCY_CHECK` | Checks that the unit is the same for all StatVars. | `stats` | None |
| `MIN_VALUE_CHECK` | Checks that the minimum value is not below a defined minimum. | `stats` | `minimum` (integer or float) |
| `MAX_VALUE_CHECK` | Checks that the maximum value is not above a defined maximum. | `stats` | `maximum` (integer or float) |
| `MAX_VALUE_CHECK` | Checks that the maximum value is not above a defined maximum. | `stats` | `maximum` (integer or float) |
| `GOLDENS` | Verifies that the data contains all records defined in a golden set. | `stats` | `golden_files` (list), `input_files` (list) |

### Golden Set Validation with `GOLDENS`

The `GOLDENS` validator ensures that your import contains a specific set of expected records. This is useful for verifying that critical StatVars, Places, or specific metadata combinations are always present in your output.

The validator compares the input data (usually from the `stats` data source) against one or more "golden" files (MCF or CSV).

#### Configuration Parameters
- `golden_files`: A list or glob pattern of golden MCF or CSV files to compare against.
- `goldens_key_property`: A list of properties to match on. If not specified, all properties in the golden record must match.
- `input_files`: (Optional) Path to specific input files. If not provided, the data source defined in the rule's `scope` is used.

#### GOLDENS Validator Example

**Rule:** "Ensure that observations for `Count_Person` and `Median_Age_Person` are present in the import as defined in our critical golden set."

```json
{
"rule_id": "verify_critical_obs",
"validator": "GOLDENS",
"scope": {
"data_source": "stats"
},
"params": {
"golden_files": ["goldens/critical_stats.csv"],
"input_files": "processed_obs.csv"
}
}
```

## Output


The framework generates a report file (specified by the `--validation_output` flag) with the results of each validation. The format of the report is determined by the file extension (`.csv` or `.json`).

### CSV Output
Expand Down
15 changes: 13 additions & 2 deletions tools/import_validation/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(self, validation_config_path: str, differ_output: str,
(self.validator.validate_min_value_check, 'stats'),
'MAX_VALUE_CHECK':
(self.validator.validate_max_value_check, 'stats'),
'GOLDENS': (self.validator.validate_goldens, 'stats'),
}

self._initialize_data_sources(stats_summary, lint_report, differ_output)
Expand Down Expand Up @@ -166,10 +167,20 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]:
validation_func, data_source_key = self.validation_dispatch[
validator_name]

rule_params = dict(rule.get('params', {}))
if rule_params:
# Add default parameters for output folder
output_dir = self.validation_output
if output_dir and not output_dir.endswith(
'/') and not os.path.isdir(output_dir):
output_dir = os.path.dirname(output_dir)
if output_dir:
rule_params.setdefault('output_path', output_dir)

if validator_name == 'SQL_VALIDATOR':
result = validation_func(self.data_sources['stats'],
self.data_sources['differ'],
rule['params'])
rule_params)
else:
scope = rule['scope']
if isinstance(scope, str):
Expand All @@ -185,7 +196,7 @@ def run_validations(self) -> tuple[bool, list[ValidationResult]]:
regex_patterns=variables_config.get('regex'),
contains_all=variables_config.get('contains_all'))

result = validation_func(df, rule['params'])
result = validation_func(df, rule_params)

result.name = rule['rule_id']
result.validation_params = rule.get('params', {})
Expand Down
66 changes: 66 additions & 0 deletions tools/import_validation/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@

_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(_SCRIPT_DIR)
_DATA_DIR = os.path.join(_SCRIPT_DIR.split('/data/')[0], 'data')
sys.path.append(os.path.join(_DATA_DIR, 'util'))

from result import ValidationResult, ValidationStatus
from counters import Counters
import validator_goldens


class Validator:
Expand Down Expand Up @@ -847,3 +851,65 @@ def validate_max_value_check(self, stats_df: pd.DataFrame,
'rows_succeeded': rows_succeeded,
'rows_failed': rows_failed
})

def validate_goldens(self, df: pd.DataFrame,
                     params: dict) -> ValidationResult:
    """Checks that every record in a golden set is present in the data.

    Args:
        df: DataFrame holding the records to validate; only consulted when
            'input_files' is absent from params.
        params: Rule configuration containing:
            'golden_files': Path(s) to golden MCF/CSV files (required).
            'input_files': (Optional) explicit input file path(s); when
                missing, records are taken from 'df'.
            'output_path': (Optional) folder or output filename used to save
                the missing golden records.
            Remaining keys are forwarded to validator_goldens as config
            (e.g. goldens_key_property).

    Returns:
        A ValidationResult: CONFIG_ERROR when 'golden_files' is missing,
        PASSED when no golden records are missing, FAILED when some are,
        and DATA_ERROR when the comparison itself raises.
    """
    golden_paths = params.get('golden_files')
    if not golden_paths:
        return ValidationResult(
            ValidationStatus.CONFIG_ERROR,
            'GOLDENS',
            message=
            "Configuration error: 'golden_files' must be specified for GOLDENS validator."
        )

    try:
        # Prefer explicitly configured input files; otherwise fall back to
        # the in-memory DataFrame, keyed by row index.
        source = params.get('input_files') or df.to_dict('index')
        run_counters = Counters()
        missing = validator_goldens.validate_goldens(
            source,
            golden_paths,
            params.get('output_path'),
            config=params,
            counters=run_counters)
        # Surface only the golden-related counters in the result details.
        details = {}
        for counter_name, counter_value in run_counters.get_counters(
        ).items():
            if 'golden' in counter_name:
                details[counter_name] = counter_value
        if missing:
            details['missing_goldens'] = missing
            return ValidationResult(
                ValidationStatus.FAILED,
                'GOLDENS',
                message=f"Found {len(missing)} missing golden records.",
                details=details)
        return ValidationResult(ValidationStatus.PASSED,
                                'GOLDENS',
                                details=details)
    except Exception as e:
        return ValidationResult(
            ValidationStatus.DATA_ERROR,
            'GOLDENS',
            message=f"Error during golden validation: {e}")
Loading