diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml new file mode 100644 index 0000000..b69c4b0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug.yml @@ -0,0 +1,60 @@ +# See https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-issue-forms +# and https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-githubs-form-schema +name: Bug Report +description: Something is not working with the Lakeflow Framework +title: "[BUG]: " +labels: ["bug", "needs-triage"] +# assignees: +# - dlt_framework-write +body: + - type: checkboxes + attributes: + label: Is there an existing issue for this? + description: Please search to see if an issue already exists for the bug you encountered. + options: + - label: I have searched the existing issues + required: true + - type: textarea + attributes: + label: Current Behavior + description: | + A concise description of what you're experiencing. + **Do not paste links to attachments with logs and/or images, as all issues with attachments will get deleted.** + Use the `Relevant log output` field to paste redacted log output without personal identifying information (PII). + You can Ctrl/Cmd+V the screenshot, which would appear as a rendered image if it doesn't contain any PII. + validations: + required: false + - type: textarea + attributes: + label: Expected Behavior + description: A concise description of what you expected to happen. + validations: + required: false + - type: textarea + attributes: + label: Steps To Reproduce + description: Steps to reproduce the behavior. + placeholder: | + 1. In this environment... + 1. With this config... + 1. Run '...' + 1. See error... + validations: + required: false + - type: dropdown + id: channel + attributes: + label: Channel + description: What SDP Channel are you running on? 
+ options: + - CURRENT + - PREVIEW + default: 0 + validations: + required: true + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..98ed3e3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ +blank_issues_enabled: false +contact_links: + - name: General Databricks questions + url: https://help.databricks.com/ + about: Issues related to Databricks and not related to Lakeflow Framework + + - name: Lakeflow Framework Documentation + url: https://github.com/erik-seefeld_data/dlt_framework/tree/main/docs + about: Documentation about Lakeflow Framework \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature.yml b/.github/ISSUE_TEMPLATE/feature.yml new file mode 100644 index 0000000..ad465f3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature.yml @@ -0,0 +1,34 @@ +# See https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-issue-forms +# and https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/syntax-for-githubs-form-schema +name: Feature Request +description: This new feature / functionality is required for the Lakeflow Framework +title: "[FEATURE]: " +labels: ["enhancement", "needs-triage"] +# assignees: +# - dlt_framework-write +body: + - type: checkboxes + attributes: + label: Is there an existing issue for this? 
+ description: Please search to see if an issue already exists for the feature request you're willing to submit + options: + - label: I have searched the existing issues + required: true + - type: textarea + attributes: + label: Problem statement + description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + validations: + required: true + - type: textarea + attributes: + label: Proposed Solution + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + attributes: + label: Additional Context + description: Add any other context, references or screenshots about the feature request here. + validations: + required: false \ No newline at end of file diff --git a/.github/workflows/develop-build.yml b/.github/workflows/develop-build.yml new file mode 100644 index 0000000..cbcb796 --- /dev/null +++ b/.github/workflows/develop-build.yml @@ -0,0 +1,37 @@ +name: Dev Version on Merge to Develop + +on: + pull_request: + branches: + - develop + types: + - closed + +jobs: + dev-version: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get latest tag + id: get_tag + run: | + TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + echo "Latest tag: $TAG" + echo "tag=$TAG" >> $GITHUB_OUTPUT + + - name: Generate pre-release version + run: | + TAG=${{ steps.get_tag.outputs.tag }} + VERS=${TAG#v} + IFS='.' 
read -r MAJOR MINOR PATCH <<< "$VERS" + PATCH=$((PATCH + 1)) + SHA=$(git rev-parse --short HEAD) + VERSION="${MAJOR}.${MINOR}.${PATCH}-dev.${SHA}" + echo $VERSION > VERSION + echo "Version: $VERSION" diff --git a/.github/workflows/loc.yml b/.github/workflows/loc.yml new file mode 100644 index 0000000..5ee27c5 --- /dev/null +++ b/.github/workflows/loc.yml @@ -0,0 +1,44 @@ +name: LOC Badge +on: + push: + branches: + - main + +jobs: + lines-of-code: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Count lines of code + run: | + curl -s https://raw.githubusercontent.com/AlDanial/cloc/master/cloc | perl - --csv --out=loc.csv . + LOC=$(tail -n1 loc.csv | cut -d',' -f5) + echo "Lines of code: $LOC" + + # Update README with LOC badge + - name: Update README.md with LOC badge + run: | + # Define the badge + LOC_BADGE="![Lines of Code](https://img.shields.io/badge/lines_of_code-${LOC}-blue)" + + # Check if README.md contains the badge + if grep -q "![Lines of Code]" README.md; then + # Update the existing badge + sed -i "s|!\[Lines of Code\](.*)|$LOC_BADGE|" README.md + else + # Add the badge at the end of README.md + echo -e "\n$LOC_BADGE" >> README.md + fi + + # Commit and push changes + - name: Commit and push changes + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git add README.md + git commit -m "Update Lines of Code badge" + git push + + diff --git a/.github/workflows/main-build.yml b/.github/workflows/main-build.yml new file mode 100644 index 0000000..097c42a --- /dev/null +++ b/.github/workflows/main-build.yml @@ -0,0 +1,59 @@ +name: Version and Tag on Merge to Main + +on: + pull_request: + branches: + - main + types: + - closed + +jobs: + tag-version: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 # So 
tags are available + + - name: Get latest tag + id: get_tag + run: | + LAST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + echo "LAST_TAG=$LAST_TAG" + echo "last_tag=$LAST_TAG" >> $GITHUB_OUTPUT + + - name: Compute next version + id: bump + run: | + OLD=${{ steps.get_tag.outputs.last_tag }} + VERS=${OLD#v} + IFS='.' read -r MAJOR MINOR PATCH <<< "$VERS" + PATCH=$((PATCH + 1)) + NEW="v$MAJOR.$MINOR.$PATCH" + echo "NEW_VERSION=$NEW" + echo "new_version=$NEW" >> $GITHUB_OUTPUT + + - name: Fail if version is not incremented + run: | + if [ "${{ steps.get_tag.outputs.last_tag }}" = "${{ steps.bump.outputs.new_version }}" ]; then + echo "❌ Version not incremented. Current tag (${{ steps.get_tag.outputs.last_tag }}) is the same as the new tag." + exit 1 + fi + + - name: Write VERSION file + run: | + echo "${{ steps.bump.outputs.new_version }}" | sed 's/^v//' > VERSION + cat VERSION + + - name: Tag and push + env: + TAG: ${{ steps.bump.outputs.new_version }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config user.name "github-actions" + git config user.email "github-actions@github.com" + git tag $TAG + git push origin $TAG diff --git a/.github/workflows/main-docs.yml b/.github/workflows/main-docs.yml new file mode 100644 index 0000000..cf0e98d --- /dev/null +++ b/.github/workflows/main-docs.yml @@ -0,0 +1,58 @@ +name: Deploy Lakeflow Framework Documentation to GitHub Pages + +on: + push: + branches: ["main"] + +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + build: + runs-on: + group: databricks-solutions-protected-runner-group + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-docs.txt + + - name: Build HTML + 
run: | + cd docs + make html + + - name: Setup Pages + uses: actions/configure-pages@v4 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: 'docs/build/html' + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: + group: databricks-solutions-protected-runner-group + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index 4f25aa2..2ae69ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,10 @@ -*conf*.json +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md .DS_Store -__pycache__ -.idea/ -.env +**/.DS_Store diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..9ed0d0e --- /dev/null +++ b/.pylintrc @@ -0,0 +1,132 @@ +[MAIN] +ignore=CVS +ignore-paths= +ignore-patterns=^\.# +ignored-modules=pyspark, delta +py-version=3.10 +suggestion-mode=yes +unsafe-load-any-extension=no + + +[BASIC] +argument-naming-style=snake_case +attr-naming-style=snake_case +class-attribute-naming-style=any +class-const-naming-style=UPPER_CASE +class-naming-style=PascalCase +const-naming-style=UPPER_CASE +docstring-min-length=-1 +function-naming-style=snake_case +include-naming-hint=no +inlinevar-naming-style=any +method-naming-style=snake_case +module-naming-style=snake_case +property-classes=abc.abstractproperty +variable-naming-style=snake_case + + +[CLASSES] +check-protected-access-in-special-methods=no +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit +valid-classmethod-first-arg=cls +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] +max-args=7 +max-attributes=7 +max-bool-expr=5 +max-branches=12 +max-locals=15 +max-parents=7 +max-public-methods=20 +max-returns=6 +max-statements=50 +min-public-methods=0 + + +[FORMAT] +expected-line-ending-format= 
+ignore-long-lines=^\s*(# )??$ +indent-after-paren=4 +indent-string=' ' +max-line-length=120 +max-module-lines=1000 +single-line-class-stmt=no +single-line-if-stmt=no + + +[IMPORTS] +allow-reexport-from-package=no +allow-wildcard-with-all=no +known-third-party=enchant + + +[LOGGING] +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old +logging-modules=logging + + +[MESSAGES CONTROL] +disable= + C0114, # Missing module docstring + R0903, # Too few public methods + W0511, # Fixme comments + raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead +enable= + + +[METHOD_ARGS] +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] +notes=FIXME, + XXX, + TODO + + +[SIMILARITIES] +ignore-comments=yes +ignore-docstrings=yes +ignore-imports=yes +ignore-signatures=yes +min-similarity-lines=4 + + +[STRING] +check-quote-consistency=no + + +[TYPECHECK] +contextmanager-decorators=contextlib.contextmanager +ignore-none=yes +ignore-on-opaque-inference=yes +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace +missing-member-hint=yes +missing-member-hint-distance=1 +missing-member-max-choices=1 +mixin-class-rgx=.*[Mm]ixin +signature-mutators= + + +[VARIABLES] +allow-global-unused-variables=yes diff --git a/.style.yapf b/.style.yapf new file mode 100644 index 0000000..c440147 --- /dev/null +++ b/.style.yapf @@ -0,0 +1,9 @@ +[style] +based_on_style = pep8 +COLUMN_LIMIT = 120 +INDENT_WIDTH = 4 +USE_TABS = False +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = True 
+SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = False +SPLIT_BEFORE_FIRST_ARGUMENT = True +SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED = True \ No newline at end of file diff --git a/.vscode/__builtins__.pyi b/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..a3c059c --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,9 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "ms-python.pylint", + "eeyore.yapf", + "redhat.vscode-yaml" + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..6945bca --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,28 @@ +{ + "python.analysis.stubPath": ".vscode", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["src"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, + "python.envFile": "${workspaceRoot}/.env", + "pylint.enabled": true, + "[python]": { + "editor.formatOnSave": false, + "editor.defaultFormatter": "eeyore.yapf", + "editor.formatOnType": false + }, + "cursorpyright.analysis.extraPaths": [ + "src" + ], + "cursorpyright.analysis.stubPath": ".vscode", + "python.languageServer": "None", +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 7e1abb8..72be72c 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,28 @@ -# REPO NAME +# Databricks Lakeflow Framework -``` -Placeholder + + +[Documentation](https://databricks-solutions.github.io/lakeflow_framework/) | +[Sample Data Bundles](samples) + -Fill here a description at a functional level - what is this content doing -``` +## Project Description -## Video Overview +The Lakeflow Framework is a metadata-driven framework designed to: +- accelerate and simplify the deployment of Spark Declarative Pipelines, and support their deployment through your SDLC. +- support a wide variety of patterns across the medallion architecture for both batch and streaming workloads. -Include a GIF overview of what your project does. Use a service like Quicktime, Zoom or Loom to create the video, then convert to a GIF. +The Framework is designed for simplicity, performance and alignment to the Databricks Product Roadmap. The Framework is designed in such a way as to allow ease of maintenance and extensibility as the SDP product evolves. +## Documentation -## Installation - -Include details on how to use and install this content. +Please refer to the [documentation](./docs/build/html/index.html) for further details and an explanation of the samples. 
+The documentation needs to be deployed as HTML or Markdown within your org before it can be used. ## How to get help Databricks support doesn't cover this content. For questions or bugs, please open a GitHub issue and the team will help on a best effort basis. - ## License © 2025 Databricks, Inc. All rights reserved. The source in this notebook is provided subject to the Databricks License [https://databricks.com/db-license-source]. All included or referenced third party libraries are subject to the licenses set forth below. - -| library | description | license | source | -|----------------------------------------|-------------------------|------------|-----------------------------------------------------| diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..fb7a04c --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +v0.4.0 diff --git a/databricks.yml b/databricks.yml new file mode 100644 index 0000000..9a81ea4 --- /dev/null +++ b/databricks.yml @@ -0,0 +1,24 @@ +# This is a Databricks asset bundle definition for dlt_framework. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: lakeflow_framework + +include: + - resources/*.yml + +variables: + version: + description: The framework version to deploy this bundle as + default: current + +targets: + # The 'dev' target, for development purposes. This target is the default. 
+ dev: + # We use 'mode: development' to indicate this is a personal development copy: + # - Deployed resources get prefixed with '[dev my_user_name]' + # - Any job schedules and triggers are paused by default + # - The 'development' mode is used for Delta Live Tables pipelines + mode: development + default: true + workspace: + root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}/${var.version} diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..38c4ebf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,32 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -c . +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build +IMAGEDIR = source/images + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile markdown + +# Add custom markdown build command +md: + @$(SPHINXBUILD) -b markdown "$(SOURCEDIR)" "$(BUILDDIR)/markdown" $(SPHINXOPTS) $(O) + @echo "Copying images..." + @mkdir -p "$(BUILDDIR)/markdown/_images" + @cp -r "$(IMAGEDIR)"/* "$(BUILDDIR)/markdown/_images" + @echo "Copying stylesheets..." + @mkdir -p "$(BUILDDIR)/markdown/_static" + @cp -r "$(SOURCEDIR)/_static"/* "$(BUILDDIR)/markdown/_static" + @echo "Build finished. The markdown files are in $(BUILDDIR)/markdown." + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 0000000..5c747eb --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,10 @@ +{% extends "!layout.html" %} + +{% block content %} +
+ {% if last_updated %} +

Last updated: {{ last_updated }}

+ {% endif %} + {{ super() }} +
+{% endblock %} diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..dfe280e --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,57 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information +import os +import sys +sys.path.append(os.path.abspath(".")) # Ensure the script is discoverable +from custom_markdown_builder import CustomMarkdownTranslator + +project = 'Lakeflow Framework' +copyright = '2025, Databricks' +author = 'Erik Seefeld, Haille Woldegebriel, Amin Movahed' +release = '0.4.0' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + 'sphinx.ext.autosectionlabel', + 'sphinx_design', + 'myst_parser', + 'sphinx_tabs.tabs', + 'custom_markdown_builder' +] + +templates_path = ['source/_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output +import sphinx_rtd_theme + +intersphinx_mapping = { + 'rtd': ('https://docs.readthedocs.io/en/stable/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} + +html_theme = "sphinx_rtd_theme" +# html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +# html_theme_options = { +# 'collapse_navigation': True +# } + +html_static_path = ['source/_static'] +html_css_files = [ + 'custom.css', +] + +html_last_updated_fmt = "%B %d, %Y" # Example: January 30, 2025 + +def setup(app): + app.add_css_file('custom.css') + app.set_translator("markdown", CustomMarkdownTranslator) 
+ #app.add_builder(MarkdownBuilder) diff --git a/docs/custom_markdown_builder.py b/docs/custom_markdown_builder.py new file mode 100644 index 0000000..2f166e2 --- /dev/null +++ b/docs/custom_markdown_builder.py @@ -0,0 +1,125 @@ +from sphinx_markdown_builder.translator import MarkdownTranslator +from sphinx_markdown_builder.builder import MarkdownBuilder +from sphinx_markdown_builder.writer import MarkdownWriter + +from docutils import nodes + +class CustomMarkdownTranslator(MarkdownTranslator): + """Custom Markdown Translator for Sphinx. Extends the default Markdown translator, only overriding specific directives.""" + + def __init__(self, document, builder): + super().__init__(document, builder) + self.has_raw_html = False + self.css_link_added = False + # Check if document contains any raw HTML nodes + for child in document.traverse(nodes.raw): + if 'html' in child.get('format', ''): + self.has_raw_html = True + break + + def visit_raw(self, node): + """Handle raw nodes, including CSS references""" + if 'html' in node.get('format', ''): + # Add CSS link before the first raw HTML content if not already added + if self.has_raw_html and not self.css_link_added: + self.add('\n\n') + self.css_link_added = True + text = node.astext() + self.add(text + '\n') + raise nodes.SkipNode + + def visit_admonition(self, node): + # Check if it's a 'note' block + classes = node.get("classes", []) + if "note" in classes: + self.add("\n> [!NOTE]\n") # Markdown blockquote with bold title + self.add("> " + "\n> ".join(line.strip() for line in node.astext().split("\n"))) + self.add("\n\n") + elif "important" in classes: + self.add("\n> [!IMPORTANT]\n") # Markdown blockquote with bold title + self.add("> " + "\n> ".join(line.strip() for line in node.astext().split("\n"))) + self.add("\n\n") + raise nodes.SkipNode + + def visit_warning(self, node): + """Convert .. 
warning:: directive to Markdown blockquote format""" + self.add("\n> [!WARNING]\n") # Markdown blockquote with bold title + self.add("> " + "\n> ".join(line.strip() for line in node.astext().split("\n"))) + self.add("\n\n") + raise nodes.SkipNode + + def visit_note(self, node): + """Convert .. note:: directive to Markdown blockquote format""" + self.add("\n> [!NOTE]\n") # Markdown blockquote with bold title + self.add("> " + "\n> ".join(line.strip() for line in node.astext().split("\n"))) + self.add("\n\n") + raise nodes.SkipNode + + # def visit_enumerated_list(self, node): + # """Handle enumerated lists (numbered or lettered)""" + # enumtype = node.get('enumtype', 'arabic') + # start = node.get('start', 1) + + # if enumtype == 'arabic': + # self.enumerated_list_style.append(('1', start)) + # elif enumtype == 'loweralpha': + # self.enumerated_list_style.append(('a', string.ascii_lowercase.index(str(start)) if str(start).isalpha() else 0)) + # elif enumtype == 'upperalpha': + # self.enumerated_list_style.append(('A', string.ascii_uppercase.index(str(start)) if str(start).isalpha() else 0)) + # else: + # self.enumerated_list_style.append(('1', start)) # Default to numbers + + # self.list_depth += 1 + + # def depart_enumerated_list(self, node): + # """Handle end of enumerated lists""" + # self.list_depth -= 1 + # if self.enumerated_list_style: + # self.enumerated_list_style.pop() + + # def visit_list_item(self, node): + # """Handle list items with proper formatting""" + # self._start_list_item(node) + + # def depart_list_item(self, node): + # self.add("\n") + + # def visit_block_quote(self, node): + # self.add(">"+" " * self.indent_level) + + # def visit_topic(self, node): + # self.add(" " * self.indent_level) + # self.add("\n".join(line.strip() for line in node.astext().split("\n"))) + + def visit_important(self, node): + """Convert .. 
important:: directive to Markdown blockquote format""" + self.add("\n> [!IMPORTANT]\n") # Markdown blockquote with bold title + self.add("> " + "\n> ".join(line.strip() for line in node.astext().split("\n"))) + self.add("\n\n") + raise nodes.SkipNode + + def visit_RawHtmlNode(self, node): + self.add("\n" + node.astext() + "\n") # Insert raw HTML directly + raise nodes.SkipNode # Skip further processing + +class CustomMarkdownWriter(MarkdownWriter): + translator_class = CustomMarkdownTranslator + +class CustomMarkdownBuilder(MarkdownBuilder): + name = 'markdown' # Override the default markdown builder + writer_class = CustomMarkdownWriter + +def setup(app): + """Register the custom markdown builder with Sphinx.""" + # Register all required config values + app.add_config_value('markdown_http_base', '', 'env') + app.add_config_value('markdown_uri_doc_suffix', '.md', 'env') + app.add_config_value('markdown_anchor_sections', True, 'env') + app.add_config_value('markdown_docinfo', False, 'env') + app.add_config_value('markdown_bullet', '*', 'env') + app.add_builder(CustomMarkdownBuilder) + return { + 'version': '0.1', + 'parallel_read_safe': True, + 'parallel_write_safe': True, + } \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..a875c43 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,49 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set IMAGEDIR=source\_images + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help +if "%1" == "markdown" goto markdown + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:markdown +echo Building markdown files... +%SPHINXBUILD% -b markdown %SOURCEDIR% %BUILDDIR%\markdown %SPHINXOPTS% %O% +echo Copying images... +if not exist %BUILDDIR%\markdown\_images mkdir %BUILDDIR%\markdown\_images +xcopy /s /y %IMAGEDIR%\* %BUILDDIR%\markdown\_images\ +echo Copying stylesheets... +if not exist %BUILDDIR%\markdown\_static mkdir %BUILDDIR%\markdown\_static +xcopy /s /y %SOURCEDIR%\_static\* %BUILDDIR%\markdown\_static\ +echo Build finished. The markdown files are in %BUILDDIR%\markdown. +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css new file mode 100644 index 0000000..a34b246 --- /dev/null +++ b/docs/source/_static/custom.css @@ -0,0 +1,130 @@ +.rst-content li li { + padding-top: 0em; + padding-bottom: 0.5em; + margin-top: 0em; +} + +.rst-content li p{ + padding-top: 0em; + padding-bottom: 0em; + margin-top: -0.5em; +} +.rst-content li { + padding-top: 0.0em; + padding-bottom: 0.2em; +} + +/* Wrap text and prevent horizontal scrolling in tables */ +.rst-content table.docutils td { + white-space: normal; + table-layout: fixed; + vertical-align: top; +} + +.rst-content table.docutils tr.highlight-row td, +.highlight-cell { + background-color: green !important; + color: white !important; +} + +/* Adjust width of main content column */ +.wy-nav-content { + max-width: 1000px; +} + +.last-updated { + font-style: italic; + color: #888; + font-size: 0.9em; +} + +/* markdown styles for raw html */ +.highlight-row { + background-color: green !important; + color: white !important; +} + +/* Custom styling for sphinx-tabs */ +.sphinx-tabs { + margin-bottom: 1rem; +} + +.sphinx-tabs-tab { + 
background-color: #f3f6f6; + color: #404040; + padding: 0.5rem 1rem; + border: 1px solid #e1e4e5; + border-bottom: none; + font-weight: 500; + font-size: 0.9rem; +} + +.sphinx-tabs-tab[aria-selected="true"] { + background-color: #fff; + color: #2980b9; + border-bottom: 2px solid #2980b9; +} + +.sphinx-tabs-panel { + background-color: #f8f8f8 !important; + border: 1px solid #e1e4e5; + border-top: none; + padding: 0; + margin-bottom: 0; +} + +/* Remove borders from code blocks inside tabs */ +.sphinx-tabs-panel .highlight { + margin: 0; + margin-bottom: 0; + border: none !important; + box-shadow: none !important; +} + +.sphinx-tabs-panel pre { + margin: 0; + margin-bottom: 0; + border-radius: 0; + border: none !important; + box-shadow: none !important; +} + +/* Remove box shadow from code blocks inside tabs */ +.sphinx-tabs-panel .highlight-default { + border: none !important; +} + +/* Target all highlight classes */ +.sphinx-tabs-panel div[class^="highlight-"] { + border: none !important; + box-shadow: none !important; +} + +/* Target code and pre elements */ +.sphinx-tabs-panel code, +.sphinx-tabs-panel pre code { + border: none !important; +} + +/* Remove border from literal blocks */ +.sphinx-tabs-panel .literal-block-wrapper { + border: none !important; +} + +.sphinx-tabs-panel .code-block-caption { + border: none !important; +} + +/* Remove padding from the last child in tab panel */ +.sphinx-tabs-panel > :last-child { + margin-bottom: 0; + padding-bottom: 0; +} + +.sphinx-tabs-panel .highlight:last-child { + margin-bottom: 0; +} + +.sphinx-tabs-panel .highlight pre { + padding-bottom: 12px; +} \ No newline at end of file diff --git a/docs/source/_static/markdown.css b/docs/source/_static/markdown.css new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/build_pipeline_bundle.rst b/docs/source/build_pipeline_bundle.rst new file mode 100644 index 0000000..052e245 --- /dev/null +++ b/docs/source/build_pipeline_bundle.rst @@ -0,0 +1,13 @@ +Build and 
Deploy Pipelines +========================== + +.. toctree:: + :maxdepth: 1 + + build_pipeline_bundle_structure + build_pipeline_bundle_steps + deploy_pipeline_bundle + pipeline_execution + Patterns: Data Flows and Pipelines + + \ No newline at end of file diff --git a/docs/source/build_pipeline_bundle_steps.rst b/docs/source/build_pipeline_bundle_steps.rst new file mode 100644 index 0000000..7aa5919 --- /dev/null +++ b/docs/source/build_pipeline_bundle_steps.rst @@ -0,0 +1,305 @@ +Building a Pipeline Bundle +########################## + +Prerequisites +============= + +- The Lakeflow Framework must be deployed. See :doc:`deploy_framework` for details. +- Ensure you have autocomplete for Data Flow Specs configured. See :doc:`feature_auto_complete` for details. +- Understanding of the core concepts of the Framework. See :doc:`concepts` for details. + +Steps to build a Pipeline Bundle +================================ + +1. Create a new Pipeline Bundle +------------------------------- + +A new Pipeline Bundle can be created using the following methods. + +* **Copy Pipeline Bundle Template:** + + You can copy the ``pipeline_bundle_template`` bundle provided with the framework. The bundle is located in the root directory of the Framework Repository. + +* **Databricks CLI - Initialize Blank Bundle:** + + .. note:: + The following steps assume that you have the Databricks CLI installed and configured. If not, please refer to the `Databricks CLI documentation `_. + + You can create a new DABs bundle from the command line by executing the command: + + .. 
code-block:: bash + + databricks bundle init + + This will create a new DABs bundle with the following structure: + + :: + + my_pipeline_bundle/ + ├── fixtures/ + ├── resources/ + ├── scratch/ + │ ├── exploration.ipynb + │ └── README.md + ├── databricks.yml + └── README.md + + + Modify the bundle to have the following structure without the files: + + :: + + my_pipeline_bundle/ + ├── fixtures/ + ├── resources/ + │ └── my_first_pipeline.yml + ├── scratch/ + ├── src/ + │ ├── dataflows + │ └── pipeline_configs + ├── databricks.yml + └── README.md + +* **Databricks CLI - Initialize Bundle Using a Custom Template:** + + .. note:: + This method assumes that: + * You have the Databricks CLI installed and configured. If not, please refer to the `Databricks CLI documentation `_. + * You have a custom template file that you want to use to initialize the bundle. Refer to the `Databricks CLI documentation `_ for more information on how to create a custom template. + * A custom template should be maintained centrally, discuss this with your platform team. + + You can create a new DABs bundle from the command line by executing the command: + + .. code-block:: bash + + databricks bundle init + +* **Copy an Existing Pipeline Bundle:** + + You can always copy an existing Pipeline Bundle to use as a starting point for a new Pipeline Bundle. If doing this bear in mind that you may need to: + + - Reset the targets and parameter in the ``databricks.yml`` file + - Clean out the following folders: ``resources``, ``src/dataflows`` and ``src/pipeline_configs`` + +2. Update the ``databricks.yml`` File +------------------------------------- + +The databricks.yml needs to be adjusted to include the following configurations: + +.. 
code-block:: yaml + + bundle: + name: bundle_name + + include: + - resources/*.yml + + variables: + owner: + description: The owner of the bundle + default: ${workspace.current_user.userName} + catalog: + description: The target UC catalog + default: main + schema: + description: The target UC schema + default: default + layer: + description: The target medallion layer + default: bronze + + + targets: + dev: + mode: development + default: true + workspace: + host: https://.databricks.com/ + variables: + framework_source_path: /Workspace/Users/${var.owner}/.bundle/nab_dlt_framework/dev/files/src + +.. note:: + * The ``framework_source_path`` variable should point to the location where the Lakeflow Framework bundle is deployed in the Databricks workspace. + * By default the Lakeflow Framework Bundle is deployed to the owner's (person deploying the bundle) workspace files home folder under the ``.bundle///files/`` directory. + * The ``owner`` can either be passed via the command line or via your CI/CD tool to allow deployment to the appropriate workspace files location in the given deployment context. See the :doc:`deploy_pipeline_bundle` section for more information. + +3. Select your Bundle Structure +------------------------------- + +Based on the Use Case and the standards defined in your Org, select the appropriate bundle structure. See the :doc:`build_pipeline_bundle_structure` section for guidance. + +4. Select your Data Flow Specification Language / Format +------------------------------------------------------- + +Based on the implementation and standards in your Org, you can select the appropriate specification language / format. See the :doc:`feature_spec_format` section for details. + +Be aware that: +- The default format is `JSON`. +- The format may have already been enforced globally at Framework level per your org's standards. +- If enabled at Framework level, you can set the format at the Pipeline Bundle level.
+- You cannot mix and match formats in the same bundle; it's important to ensure consistency for engineers working on the same bundle. + +5. Setup your Substitutions Configuration +----------------------------------------- + +If you haven't already done so, familiarize yourself with the :doc:`feature_substitutions` feature of the Framework. + +If you need to use substitutions and the substitutions you require have not been configured globally at the Framework level, you now need to set up your substitutions file. See the :doc:`substitutions` section for guidance. + +.. note:: + This step is optional and only required if substitutions are required to deploy the same pipeline bundle to multiple environments with different resource names. This step can also be actioned later in the build process after the Data Flow Specs have been created. + +6. Build your Data Flows +------------------------ + +Iterate over the following steps to create each individual Data Flow: + +1. **Understand your Use Case:** + + In this step you will need to make two selections: + a. The Data Flow Type: Standard or Flows + #. Select from an existing pattern or create a new one. Refer to the :doc:`patterns` section for more information on the different patterns. + + To make these selections you need to consider the following: + a. What layer of the Lakehouse will the Data Flow read from and write to? + #. Is this a streaming or batch data flow? + #. Is my target table SCD0, SCD1 or SCD2? + #. Are there Data Quality rules that need to be applied to the data? + #. How many source tables are there, what join strategy is required and do the tables share common keys and sequence columns? + #. Are there any transformations required and if so what type and complexity? + #. What are the latency / SLA requirements? + +2. **Update your Substitutions Configuration:** + + If necessary add any substitutions required for your Data Flow to the substitutions file. + +3. **Build the Data Flow Spec:** + + a.
Create a sub-directory per your selected bundle structure: + + If necessary, create a new folder in the ``src/dataflows`` directory based on your selected bundle strategy. + + b. Create Data Flow Spec file(s): + * Refer to the :doc:`dataflow_spec_reference` section to build your Data Flow Spec. + * Refer to the :doc:`patterns` section for high level patterns and sample code. + * Refer to the :doc:`deploy_samples` section on how to deploy the samples so you can reference the sample code. + + c. Create schema JSON / DDL file(s): + + Create your schema JSON / DDL files in the ``schemas`` sub-directory of your Data Flow Spec's home folder: + + * You should in general always specify a schema for your source and target tables, unless you want schema evolution to happen automatically in Bronze. + * Schemas are optional for staging tables. + * Each schema must be defined in its own individual file. + * Each schema must be referenced by the appropriate object(s) in your Data Flow Spec JSON file(s). + + A schema file must have a format similar to the below example: + + The schema specification can be found in the :doc:`feature_schemas` section. + + d. Create SQL Transform file(s): + + If you have transforms in your Data Flow, you will need to create a SQL file for each transform, in the ``dml`` sub-directory of your Data Flow Spec's home folder. + + e. Create Data Quality Expectations file(s): + + If you have data quality expectations in your Data Flow, you will need to create an expectations file for your target table in the ``expectations`` sub-directory of your Data Flow Spec's home folder. + + Refer to the :doc:`feature_data_quality_expectations` section for guidance on how to create an expectations file. + +7. Create your Pipeline Definitions +----------------------------------- + +Spark Declarative Pipelines are defined in the ``resources`` directory. Each Pipeline is defined in its own individual YAML file.
+ +DABs will use these YAML files to create the Spark Declarative Pipelines in the target Databricks Workspace. + +How many pipeline resource files you create and how you configure them will be based on the Bundle Structure you have selected. + +To create a single Pipeline definition, follow these steps: + +1. **Create a resource YAML file:** + + Create a new YAML file in the ``resources`` directory. Name it after the Pipeline you want to create. + +2. **Add the Base YAML Definition:** + + Add the following base content to the file, replacing the ```` tags in the highlighted rows with the appropriate values for your Pipeline. + + .. code-block:: yaml + :emphasize-lines: 3, 4 + + resources: + pipelines: + : + name: + catalog: ${var.catalog} + schema: ${var.schema} + channel: CURRENT + serverless: true + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: /Workspace/${workspace.file_path}/src + framework.sourcePath: /Workspace/${var.framework_source_path} + workspace.host: ${workspace.host} + bundle.target: ${bundle.target} + pipeline.layer: ${var.layer} + +3. **Add any required Data Flow filters:** + + By default, if you don't specify any Data Flow filters, the pipeline will execute all Data Flows in your Pipeline Bundle. + + If you are creating more than one Pipeline definition in your bundle, you may want your Pipeline(s) to only execute specific Data Flows. + + The Framework provides a number of ways to filter the Data Flows a pipeline executes. These can be set per the configuration options described below: + + ..
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Configuration Option + - Description + * - ``pipeline.dataFlowIdFilter`` + - The ID(s) of the data flow to include in the pipeline + * - ``pipeline.dataFlowGroupFilter`` + - The data flow group(s) to include in the pipeline + * - ``pipeline.flowGroupIdFilter`` + - The ID's of the flow group(s), in a data flow, to include in the pipeline + * - ``pipeline.fileFilter`` + - The file path for the data flow to include in the pipeline + * - ``pipeline.targetTableFiler`` + - The target table(s) to include in the pipeline + + .. note:: + For all the above filter fields, the values can be a single value or multiple values separated by a comma. + + You can add the appropriate Data Flow filter options described above to the Pipeline definition, as show below: + + .. code-block:: yaml + :emphasize-lines: 20-23 + + resources: + pipelines: + : + name: + catalog: ${var.catalog} + schema: ${var.schema} + channel: CURRENT + serverless: true + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: /Workspace/${workspace.file_path}/src + framework.sourcePath: /Workspace/${var.framework_source_path} + workspace.host: ${workspace.host} + bundle.target: ${bundle.target} + pipeline.layer: ${var.layer} + pipeline.targetTableFiler: + pipeline.dataFlowIdFilter: + pipeline.flowGroupIdFilter: + pipeline.fileFilter: diff --git a/docs/source/build_pipeline_bundle_structure.rst b/docs/source/build_pipeline_bundle_structure.rst new file mode 100644 index 0000000..a48379b --- /dev/null +++ b/docs/source/build_pipeline_bundle_structure.rst @@ -0,0 +1,156 @@ +Bundle Scope and Structure +########################## + +When creating a Pipeline Bundle, it is important to decide on the scope and structure of the bundle. + +This will be informed by the following factors: + +* Your organizational structure. +* Your operational standards, practices and your CI/CD processes. 
+* The size and complexity of your data estate. +* The Use Case +* The Layer of your Lakehouse you are targeting + +Ultimately you will need to determine the best way to scope your Pipeline Bundles for your deployment. + +.. important:: + + Per the :doc:`concepts` section of this documentation: + + * A data flow, and its Data Flow Spec, defines the source(s) and logic required to generate a **single target table**. + * A Pipeline Bundle can contain multiple Data Flow Specs, and a Pipeline deployed by the bundle may execute the logic for one or more Data Flow Specs. + + For the above reasons **the smallest possible unit of logic that can be deployed by a Pipeline Bundle is a single Pipeline, executing a single data flow, that populates a single target table**. + +Bundle Scope +============ + +Bundle Scope simply refers to the level of the logical grouping of Data Flows and Pipeline resources within a Pipeline Bundle. + +Some of the most common grouping strategies are shown below: + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Logical Grouping + - Description + * - Monolithic + - A single Pipeline Bundle containing all data flows and Pipeline definitions. Only suitable for smaller and simpler deployments. + * - Bronze + - * Source System - A Pipeline per Source System or application + * - Silver / Enterprise Models + - * Subject Area / Sub-Domain - A Pipeline per Subject Area, or Sub-Domain + * Use Case - A Pipeline per Use Case + * Target Table - A Pipeline per target table, the most granular level for complex data flows + * - Gold / Dimensional Models + - * Data Mart - A Pipeline per Data Mart + * Common Dimensions - A Pipeline for your Common Dimensions + * Target Table - A Pipeline for complex Facts or target tables. + * - Use Case + - You may choose to have an end to end pipeline for given Use Cases + +Once you have determined the scope of your Pipeline Bundle, you can move on to determining its structure.
+ +Bundle Structure +================= + +The high-level structure of a Pipeline Bundle never changes and is as follows: + +:: + + my_pipeline_bundle/ + ├── fixtures/ + ├── resources/ + │ └── my_first_pipeline.yml + ├── scratch/ + ├── src/ + │ ├── dataflows + │ └── pipeline_configs + ├── databricks.yml + └── README.md + +.. note:: + + Refer to the :doc:`concepts` section for more details on the different components of a Pipeline Bundle. + +It is the structure of the ``src/dataflows`` directory that is flexible and can be organised in the way that best suits your standards and ways of working. The Framework will: + +* Read all the Data Flow Spec files under the ``src/dataflows`` directory, regardless of the folder structure. Filtering of the Dataflows is done when defining your Pipeline and is discussed in the :doc:`build_pipeline_bundle_steps` section. +* Expect that the schemas, transforms and expectations related to a Data Flow Spec are located in their respective ``schemas``, ``dml`` and ``expectations`` sub-directories within the Data Flow Spec's home directory. + +The most common ways to organize your ``src/dataflows`` directory are: + +1. **Flat:** + + :: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows + │ │ ├── table_1_data_flow_spec_main.json + │ │ ├── table_2_data_flow_spec_main.json + │ │ ├── dml + │ │ │ ├── table_1_tfm.sql + │ │ │ ├── table_2_tfm_1.sql + │ │ │ └── table_2_tfm_2.sql + │ │ ├── expectations + │ │ │ └── table_2_dqe.json + │ │ ├── python_functions + │ │ └── schemas + │ │ ├── table_1.json + │ │ └── table_2.json + +2. 
**By Use Case:** + + :: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows + │ │ ├── use_case_1 + │ │ │ ├── table_1_data_flow_spec_main.json + │ │ │ ├── table_2_data_flow_spec_main.json + │ │ │ ├── dml + │ │ │ │ ├── table_1_tfm.sql + │ │ │ │ ├── table_2_tfm_1.sql + │ │ │ │ └── table_2_tfm_2.sql + │ │ │ ├── expectations + │ │ │ ├── python_functions + │ │ │ └── schemas + │ │ │ ├── table_1.json + │ │ │ └── table_2.json + │ │ └── use_case_2 + │ │ ├── table_1_data_flow_spec_main.json + │ │ ├── table_2_data_flow_spec_main.json + │ │ ├── dml + │ │ ├── expectations + │ │ ├── python_functions + │ │ └── schemas + +3. **By Target Table:** + + :: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows + │ │ ├── table_1 + │ │ │ ├── table_1_data_flow_spec_main.json + │ │ │ ├── dml + │ │ │ │ ├── table_1_tfm.sql + │ │ │ ├── expectations + │ │ │ ├── python_functions + │ │ │ └── schemas + │ │ │ └── table_1.json + │ │ └── table_2 + │ │ ├── table_2_data_flow_spec_main.json + │ │ ├── dml + │ │ │ ├── table_2_tfm_1.sql + │ │ │ └── table_2_tfm_2.sql + │ │ ├── expectations + │ │ │ └── table_2_dqe.json + │ │ ├── python_functions + │ │ └── schemas + │ │ └── table_2.json + │ └── pipeline_configs diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst new file mode 100644 index 0000000..6295f08 --- /dev/null +++ b/docs/source/concepts.rst @@ -0,0 +1,500 @@ +Framework Concepts +################## + +The purpose of the Framework is to provide a standard metadata driven approach to creating Databricks Spark Declarative Pipelines. + +The below diagram illustrates some of the key concepts of the Framework, which are explained in more detail in the following sections. + +.. image:: images/framework_concept_overview.png + :target: _images/framework_concept_overview.png + :alt: Framework Concept Overview + +.. 
_concepts_dabs: + +Databricks Asset Bundles (DABs) +=============================== +Databricks Asset Bundles (DABs) are a way to package and deploy Databricks assets such as source code, Spark Declarative Pipelines notebooks and libraries. +This concept is core to how the Lakeflow Framework has been designed and implemented. + +Detailed documentation on DABs can be found at: https://docs.databricks.com/en/dev-tools/bundles/index.html + + +.. _concepts_bundle_types: + +Bundle Types +============ + +The Lakeflow Framework is composed of two bundle types: + +* **Framework Bundle** + + The Framework Bundle contains the core framework source code and configuration. + +* **Pipeline Bundles** + + Pipeline Bundles contain: + + * **Data Flow Specs** - A collection of files that define the execution logic of a Pipeline; these are a core concept which is explained further below in :ref:`concepts_dataflow_specs`. + * **Pipeline Definitions** - YAML files that define the Pipeline configuration. + + +.. _concepts_framework_bundle: + +Framework Bundle +---------------- + +The Framework Bundle contains: + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - **Framework Source Code** + - The core framework source code under the ``src`` folder. + * - **Global Framework Configuration** + - The global framework configuration under the ``src/config`` folder. + * - **Data Flow Spec Schema Definition** + - The Data Flow Spec schema definition and validations under the ``src/schemas`` folder. + * - **Deployment YAML file** + - The ``databricks.yml`` file which defines the bundle deployment settings. + + +The Framework Bundle is deployed to a given workspace files location from where its source code can be referenced by any deployed Pipelines. + +..
_concepts_framework_settings: + +Framework Configuration +~~~~~~~~~~~~~~~~~~~~~~~ + +The Framework and most of its features will have configuration settings that can be set in the Framework Bundle ``src/config`` folder. The configuration settings are explained in the section: :doc:`features` + +.. admonition:: Setting Precedence + :class: note + + * Where settings can only be configured in the Framework Bundle, they can not be altered by a Pipeline Bundle. + * Where settings can be configured in both the Framework Bundle and the Pipeline Bundles, Pipeline Bundle settings will take precedence. + +.. _concepts_pipeline_bundles: + +Pipeline Bundles +---------------- +A Pipeline Bundle is used to define and deploy one or more Spark Declarative Pipelines. + +A Pipeline Bundle contains: + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - **Data Flow Specs** + - Located under the ``src/dataflows`` folder. These are a core concept which are explained further below in :ref:`concepts_dataflow_specs`. + * - **Pipeline Definitions** + - YAML files in the ``resources`` folder. These are used to create the Spark Declarative Pipelines in the target workspace. + * - **Pipeline Level Framework Configuration** + - Configuration files under the ``src/pipeline_configs`` folder. + * - **Deployment YAML file** + - The ``databricks.yml`` file which defines the bundle deployment settings. + +When Pipeline Bundles are deployed, DABs will: + +1. Deploy the bundle to a given workspace files location from where its source code can be referenced by any Pipelines deployed by the bundle. +2. Create a Spark Declarative Pipeline per YAML file in the ``resources`` folder of the Pipeline Bundle. + +Pipeline Bundle Structure: + +:: + + my_pipeline_bundle/ + ├── databricks.yml + ├── fixtures/ + ├── resources/ + ├── scratch/ + ├── src/ + │ ├── dataflows/ + │ └── pipeline_configs/ + └── README.md + +.. 
_concepts_dataflow_specs: + +Data Flow Specs +~~~~~~~~~~~~~~~ +A Data Flow Spec is a collection of files that define a single data flow that creates, maintains and loads a single target table. + +A Data Flow Spec contains the following files: + + .. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - File Type + - Description + * - **Data Flow Specification** + - One or more files that define the execution logic required to generate a target table. + * - **Schema JSON** + - One or more files that define the schema of the source tables, staging tables and target table in the Pipeline. + * - **Expectations** + - One or more files that specify the data quality expectations for the given target table. + * - **SQL Transforms** + - One or more SQL files that can be used to specify transformations in the pipeline. + +.. important:: + + * A Data Flow Spec defines the source(s) and logic required to generate a single target table + * A Pipeline Bundle can contain multiple Data Flow Specs, and a Pipeline deployed by the bundle may execute the logic for one or more Data Flow Specs. + +Example Data Flow Spec Structure: + +:: + + my_pipeline_bundle/ + └── src/ + ├── dataflows + │ ├── dataflowspec + │ │ └── table_1.json + │ ├── dml + │ │ └── transform.sql + │ ├── expectations + │ │ └── table_1_expectations.json + │ └── schemas + │ └── source_table_schema.json + │ └── table_1_schema.json + └── pipeline_configs + └── pipeline_config.json + +.. _concepts_pipeline_resources: + +Pipeline Resource YAML Files +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +A Pipeline Bundle, must contain at least one YAML file in the ``resources`` folder that defines a Pipeline configuration; it may contain multiple YAML files, depending on the number of pipelines to be defined. + +The resource YAML files are used by DABs to create, update or destroy the Spark Declarative Pipelines in the target workspace. + +.. 
_concepts_pipeline_bundle_config: + +Pipeline Bundle Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Certain Framework features can be configured in a Pipeline Bundle, in the ``src/pipeline_configs`` folder. These configuration settings are explained in the section: :doc:`features` + +.. _concepts_dataflow_types: + +Pipelines +========= + +A Spark Declarative Pipeline is essentially an ETL pipeline that executes on the Databricks platform. Pipeline bundles provide a standard way to define and deploy Spark Declarative Pipelines. + +.. image:: images/pipeline_concepts_overview.png + :target: _images/pipeline_concepts_overview.png + :alt: Pipeline Concepts Overview + +The above diagram illustrates the following key concepts: + +1. Pipeline Bundles define: + + * Spark Declarative Pipelines + * The logic that Spark Declarative Pipelines execute + * The staging and target tables that Spark Declarative Pipelines maintain + +2. A Spark Declarative Pipeline is defined in a resource YAML file of a Pipeline Bundle. +3. Once the Pipeline Bundle is deployed, the Spark Declarative Pipeline is created in the target workspace with the following key settings: + + * The workspace files location of the entry point Notebook for the Framework. + * The workspace files location of the Data Flow Spec source folder for the Pipeline Bundle. + * Settings that define which data flows to execute when the pipeline is deployed. + +4. When a Spark Declarative Pipeline executes, the following steps are performed: + +.. list-table:: + :widths: 10 30 60 + :header-rows: 1 + + * - Step + - Name + - Description + * - 1 + - Load and Initialize Framework + - Load and initialize the Framework + * - 2 + - Retrieve Data Flow Specifications + - + a. **Retrieve and validate:** + - Read and validate ALL the Data Flow Specifications, Expectations, and Secrets Configurations from the workspace files location of the Pipeline Bundle. + - If a file is not valid it will be added to an error list. 
+ - If any files failed validation, the pipeline will fail and the user will receive a list of validation errors. + b. **Apply pipeline filters:** + - The framework will apply any pipeline filters to the in-memory dictionary. + - The only exception to this is the File Filter, which causes the framework to read only the specified file(s). + * - 3 + - Generate Pipeline Definition + - The Framework will then use the in-memory dictionary to initialize the Spark Declarative Pipeline. + * - 4 + - Execute Pipeline + - The pipeline will then execute the logic defined in the Data Flow Specifications. + +.. _concepts_data_flows: + +Data Flows & Data Flow Specs +============================ + + +A data flow defines the source(s) and logic required to generate a single target table. Data flows are defined using Data Flow Specs. + +.. admonition:: Data Flows as Building Blocks + + Each data flow and the components therein can be thought of as building blocks to build out your overall Pipelines. + +In the Lakeflow Framework the following types of data flows can be defined in a Data Flow Spec: + +1. **Standard** + + A standard data flow is a simpler data flow type that allows a pipeline to be defined with a single source view (which may implement SQL or joins but would typically be over a single table) and target table. + They are simple to define and are useful for ingestion and Bronze scenarios. + +2. **Flow** + + Flows data flows allow you to create simple or complex data flows, using the different components of a flow as building blocks. They implement the :doc:`feature_multi_source_streaming` feature of DLT. + Flows are useful for Silver and Gold scenarios, and where multiple sources and transformations are required. + +3. **Materialized Views** + + Materialized Views are the precomputed results of a query stored in a Table. They are useful for Gold scenarios, and where complex transformations are required. + +..
important:: + + * A data flow is defined by a Data Flow Spec + * A data flow defines the source(s) and logic required to generate a single target table + * A Pipeline Bundle can contain multiple Data Flow Specs, and a Pipeline deployed by the bundle may execute the logic for one or more Data Flow Specs. + +.. _concepts_standard_data_flow: + +Standard Data Flow +------------------- + +Data Flow Spec Components: + +.. code-block:: text + + Standard Data Flow + ├── Data Flow Metadata + ├── Source Details + ├── Target Details + ├── CDC Details (optional) + ├── Expectations (optional) + ├── Quarantine Details (optional) + └── Table Migration Details (optional) + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - :ref:`Data Flow Metadata ` + - Defines the metadata for the data flow. + * - :ref:`Source Details ` + - Specifies the source type, source view and its properties. + * - :ref:`Target Details ` + - Specifies the target table, its configuration and properties. + * - :ref:`CDC Details (optional) ` + - Enables the CDC flows to populate the target table. + * - :ref:`Expectations (optional) ` + - Enable expectations and specify the location of the expectations file(s). + * - :ref:`Quarantine Details (optional) ` + - Set the quarantine mode and if the mode is ``table`` the details of the quarantine table. + * - :ref:`Table Migration Details (optional) ` + - The details of the table being migrated from. + +.. _concepts_flows_data_flow: + +Flows Data Flow +--------------- + +Data Flow Spec Components: + +.. 
code-block:: text + :emphasize-lines: 8 + + Flows Data Flow + ├── Data Flow Metadata + ├── Target Details + ├── CDC Details (optional) + ├── Expectations (optional) + ├── Quarantine Details (optional) + |── Table Migration Details (optional) + └── Flow Groups + ├── Flow Group 1 + └── Flow Group n + ├── Staging Tables (optional) + | ├── Staging Table 1 + | └── Staging Table n + | ├── Target Details + | └── CDC Details (optional) + └── Flows + ├── Flow 1 + └── Flow n + ├── Flow Type + ├── Flow Details + └── Views (optional) + ├── View 1 + └── View n + ├── Mode + └── Source Details + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - :ref:`Data Flow Metadata ` + - Defines the metadata for the data flow. + * - :ref:`Target Details ` + - Specifies the target table, its configuration and properties. + * - :ref:`CDC Details (optional) ` + - Enables the CDC flows to populate the target table. + * - :ref:`Expectations (optional) ` + - Enable expectations and specify the location of the expectations file(s). + * - :ref:`Quarantine Details (optional) ` + - Set the quarantine mode and if the mode is ``table`` the details of the quarantine table. + * - :ref:`Table Migration Details (optional) ` + - The details of the table being migrated from. + * - :ref:`Flow Groups ` + - Contains the flow groups for the dataflow. + + * A flow group can contain one or more flows. + * flows implements the :doc:`feature_multi_source_streaming` feature of DLT.' + + +Flow Groups Explained +~~~~~~~~~~~~~~~~~~~~~ + +A Flow Group is a logical grouping of related flows and staging tables, which ultimately allows you to define complex data flows with many sources, transformations and views, that can evolve over time. + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - :ref:`Staging Tables ` + - Allows you to define staging tables that can be used in any of the flows defined in the flow group. 
+ * - :ref:`Flows ` + - A logical grouping of related flows. + +.. important:: + + The :doc:`Multi-Source Streaming ` feature allows you to stream multiple flows into a single target. + + Core to this functionality is the ability to add and remove Flow Groups and Flows therein, as your requirements and systems evolve. This will not break the existing pipeline and will not require a full refresh of the Pipeline. + +Staging Tables Explained +~~~~~~~~~~~~~~~~~~~~~~~~ + +Staging tables are used to store the data that is being transformed and loaded into the target table. + +Some key points to note: + + * Staging tables are optional. + * Staging tables can be referenced as a source or target in any of the flows defined in the flow group. + * In some cases for very large and complex data flows, you may want to decompose your data flow into smaller, more manageable data flows. In this instance staging tables may in fact become target tables in smaller, more manageable data flows. In these cases they can only be used as a source in downstream Pipelines. This, however, really depends on the design practices you choose to follow. + +When defining a staging table, you can specify the following: + + * The ``name`` of the staging table. + * The ``type`` of staging table. Currently only Streaming Tables are supported. + * Optional - The ``schemaPath`` of the staging table. + * Optional - The ``tableProperties`` of the staging table. + * Optional - The ``cdcSettings`` configuration of the staging table. + +Flows Explained +~~~~~~~~~~~~~~~ + +Flows are the building blocks of a Data Flow and they implement the :doc:`feature_multi_source_streaming` feature of DLT. + +Flows can be defined in one of two ways: + +1. **Source Table --> Target Table**: + + This is the simplest flow and requires no views. This is only possible when using the ``merge`` flow type. + +2.
**View(s) --> Target Table**: + + This requires the definition of at least one view that is used as the source for the flow into the target table. You can also chain multiple views together where multiple transformation steps are required. + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - :ref:`Flow Type ` + - Allows you to define the type of flow. The Framework supports three types of flows: + + * **append_view** - Uses a source view to append data to a staging or target table. + * **append_sql** - Uses a raw SQL statement to append data to a staging or target table. + * **merge** - Uses the :ref:`CDC API's ` to merge data from a source view to a staging or target table. + + * - :ref:`Flow Details ` + - Defines the source and target of the flow and any additional properties required for the flow type. + * - :ref:`Views ` (optional) + - Views are used to define the source and any additional transformations for a flow. The different types of views are documented in the following sections: + + * :doc:`feature_source_target_types` + * :ref:`dataflow-spec-flows-view-configuration` + +.. important:: + + The :doc:`Multi-Source Streaming ` feature allows you to stream multiple flows into a single target. + + Core to this functionality is the ability to add and remove Flow Groups and Flows therein, as your requirements and systems evolve. This will not break the existing pipeline and will not require a full refresh of the Pipeline. + +.. _concepts_materialized_views: + +Materialized Views +------------------- + +Materialized Views are the precomputed results of a query stored in a Table. They are typically used for Gold scenarios, and where complex transformations are required. + +Data Flow Spec Components: + +.. 
code-block:: text + + Materialized Views + ├── Materialized View 1 + │ ├── Data Flow Metadata + │ ├── Source Details + │ ├── Table Details (optional) + │ ├── Data Quality Expectations (optional) + │ └──Quarantine Details (optional) + └── Materialized View n + ├── Data Flow Metadata + ├── Source Details + ├── Table Details (optional) + ├── Data Quality Expectations (optional) + └──Quarantine Details (optional) + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Component + - Description + * - :ref:`Data Flow Metadata ` + - Defines the metadata for the data flow. + * - :ref:`Source Details ` + - Specifies a source view, path to a SQL file or a raw SQL statement. + * - :ref:`Table Details (optional) ` + - Specifies any additional configuration for the target table, its configuration and properties. + * - :ref:`Data Quality Expectations (optional) ` + - Enable expectations and specify the location of the expectations file(s). + * - :ref:`Quarantine Details (optional) ` + - Set the quarantine mode and if the mode is ``table`` the details of the quarantine table. + +Patterns +-------- + +Detailed documentation on the different patterns that can be used to build out your data flow and Pipelines can be found in the section: :doc:`patterns` diff --git a/docs/source/contributor.rst b/docs/source/contributor.rst new file mode 100644 index 0000000..c2c906a --- /dev/null +++ b/docs/source/contributor.rst @@ -0,0 +1,11 @@ +Framework Development & Contributors +##################################### + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + contributor_dev_env + contributor_dev_git + contributor_dev_steps + contributor_dev_docs diff --git a/docs/source/contributor_dev_docs.rst b/docs/source/contributor_dev_docs.rst new file mode 100644 index 0000000..b4350b5 --- /dev/null +++ b/docs/source/contributor_dev_docs.rst @@ -0,0 +1,93 @@ +Updating the Documentation +########################## + +The documentation is written in `reStructuredText `_ format and is built using `Sphinx `_. + +Sphinx was chosen for its ease of use and ability to: + +* easily generate a navigation bar and index. +* easily add cross references to other parts of the documentation. +* support the more advanced documentation requirements for some of the pipeline pattern and feature documentation. + +Source Files +------------ + +The source files for the documentation are located in the ``docs/source`` directory. + +Writing Documentation +-------------------------- + +1. If a new feature is added or an existing feature is changed, ensure the feature is well documented in a new feature file or update the existing feature file with the name feature_.rst and add it to the :doc:`features` page. + - In the feature file, include: + - Feature description + - Configuration options + - Usage examples / sample code +2. Update Data Flow Spec reference where applicable + + +Styling +-------- +Styling for the documentation, which controls the look of the HTML output, is defined in two main locations: + +1. The theme and options that are used to generate the HTML output are defined in the conf.py file. The existing theme should not be changed unless there is an agreement on a new theme. + + - More information on theming can be found here: https://www.sphinx-doc.org/en/master/usage/theming.html + - The available html output options can be found here: https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output +2. 
Custom styling can be found in the custom.css file which can be found in the docs/_static directory. + + - More information on how to add custom CSS can be found here: https://docs.readthedocs.com/platform/stable/guides/adding-custom-css.html#overriding-or-replacing-a-theme-s-stylesheet + +Generating the Documentation +---------------------------- + +Supported Formats +~~~~~~~~~~~~~~~~~ + +Sphinx has been configured to generate the following formats: + +* HTML +* Markdown + +Dependencies +~~~~~~~~~~~~ + +The dependencies are installed as part of the development environment setup. If you have not completed that setup, run the following command in the root directory of the repository: + +.. code-block:: bash + + pip install -r requirements-dev.txt + +Building the Documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Framework comes with a make file to build the documentation. There is one for Bash and one for Windows. + +To build the documentation, run the following command in the docs directory: + +* HTML: + + .. code-block:: bash + + make html + +* Markdown: + + .. code-block:: bash + + make md + +The above commands will build the documentation and save it in the ``docs/build`` directory. + +To view the documentation, open the files below in your browser: + +* HTML: ``docs/build/html/index.html`` +* Markdown: ``docs/build/markdown/index.md`` + +Clean the Documentation +~~~~~~~~~~~~~~~~~~~~~~~ + +Sometimes it will be necessary to delete the build directory and re-build the documentation. To do this, run the following command in the docs directory: + +.. code-block:: bash + + make clean diff --git a/docs/source/contributor_dev_env.rst b/docs/source/contributor_dev_env.rst new file mode 100644 index 0000000..78c008d --- /dev/null +++ b/docs/source/contributor_dev_env.rst @@ -0,0 +1,24 @@ +Development Environment Setup +############################# + +The sections below assume the Lakeflow Framework repository has been cloned from git and you are in the root directory.
If not, please do so first. + +Setting up for development as a contributor to the Lakeflow Framework +====================================================================== + +Once you have cloned the Lakeflow Framework repository, you'll need to follow the steps below to set up the framework. + +1. **Install requirements** + + Install the required dev dependencies from the root directory by running the following command (you may also want to use a virtual environment, venv for this by running ``python -m venv .venv/`` before running the command below to install dev requirements. See more `Python Virtual Environments `_): + ```bash + pip install -r requirements-dev.txt + ``` +2. **Set up VS Code extensions** + + Once you open the Lakeflow Framework workspace in VS Code for the first time, VS Code will prompt you to install the recommended extensions. + If you missed this prompt, you can review and install the recommended extensions with the Extensions: Show Recommended Extensions command or by clicking on the extensions tab on the left side of the window and selecting "Workspace Recommendations". + +.. note:: + + To deploy the Lakeflow Framework to your Databricks workspace, follow the steps in :doc:`deploy_framework`. diff --git a/docs/source/contributor_dev_git.rst b/docs/source/contributor_dev_git.rst new file mode 100644 index 0000000..543bece --- /dev/null +++ b/docs/source/contributor_dev_git.rst @@ -0,0 +1,61 @@ +GIT +#### + +Branching Strategy +------------------ + +The project follows the Gitflow branching model for version control. This model provides a robust framework for managing larger projects with scheduled releases.
+ +Our repository maintains the following primary branches: + +* ``main`` - Contains production-ready code +* ``develop`` - Main integration branch for ongoing development +* ``release`` - Used when preparing a new production release +* ``feature`` - Short-lived branches for new feature development +* ``fix`` - Short-lived branches for bug fixes + +Feature Branch Guidelines +^^^^^^^^^^^^^^^^^^^^^^^^^ + +* Create feature branches from ``develop`` +* Branch naming: ``feature/descriptive-name`` (e.g. ``feature/add-cdc-support``) +* Keep changes focused and atomic +* Regularly sync with ``develop`` to minimize merge conflicts +* Submit pull request to merge back into ``develop`` when complete + +Fix Branch Guidelines +^^^^^^^^^^^^^^^^^^^^^ + +* Create fix branches from ``develop`` for non-critical bugs +* Branch naming: ``fix/issue-description`` (e.g. ``fix/logging-format``) +* Include issue reference in commit messages when applicable +* Keep changes minimal and focused on the bug fix +* Submit pull request to merge back into ``develop`` when complete + +Release Strategy +---------------- + +The project follows a structured release process aligned with the Gitflow branching model: + +1. **Release Branch Creation** + + * When ``develop`` branch contains all features planned for release + * Create release branch: ``release/vX.Y.Z`` + * Branch naming follows semantic versioning (e.g. ``release/v1.2.0``) + +2. **Release Finalization** + + * After thorough testing and stabilization: + - Merge release branch into ``main`` + - Tag the release in ``main`` with version number + - Merge release branch back into ``develop`` + - Delete the release branch + +3. **Hotfix Process** + + * For critical production issues: + - Create hotfix branch from ``main`` (e.g. 
``hotfix/v1.2.1``) + - Implement and test the fix + - Merge hotfix into both ``main`` and ``develop`` + - Tag the new version in ``main`` + diff --git a/docs/source/contributor_dev_steps.rst b/docs/source/contributor_dev_steps.rst new file mode 100644 index 0000000..e47293b --- /dev/null +++ b/docs/source/contributor_dev_steps.rst @@ -0,0 +1,87 @@ +Development Steps +################# + +This guide outlines the process for contributing features or fixes to the Lakeflow Framework. + +Issue Creation +-------------- +1. Create a new issue in the GitHub repository + + - Clearly describe the feature or bug + - Include acceptance criteria + - Add relevant labels (feature/bug/enhancement) + - Link to related issues if applicable + +Branch Management +----------------- +1. Create a feature branch from develop + + - Use naming convention: ``feature/[brief-description]`` + - Example: ``feature/add-scd2-support`` +2. Keep branches focused on single features/fixes +3. Regularly sync with develop to avoid merge conflicts + +Development Process +------------------- +1. Local Development + + - Follow coding standards and style guides + - Ensure the yapf extension is installed and enabled in VS Code (refer to step 2 of :doc:`contributor_dev_env`) + - Use yapf to format your Python code (right click and select 'Format Document With' then select yapf) + - Stick to SOLID principles and object-oriented design patterns + - Deploy updated framework to Databricks to ensure it is working as expected + - Use meaningful commit messages + - Keep commits atomic and focused + +2. Unit Testing + + - Write unit tests per :doc:`contributor_unit_test` + - Test both success and failure scenarios + - Ensure test coverage meets requirements + - Run existing test suite to check for regressions + +3. 
Integration Testing / Samples + + - Where applicable, add sample pipelines to bronze or silver to show how to use the new feature + - Deploy and run existing sample pipelines on Databricks to ensure changes are not breaking existing functionality (refer to :doc:`deploy_samples`) + +4. Documentation + - Update documentation per :doc:`contributor_dev_docs` + +Pull Request Process +-------------------- +1. PR Creation + + - Create PR from feature branch to develop + - Fill out PR template completely + - Link related issues + - Add relevant reviewers + +2. PR Review + + - Address reviewer comments + - Update code/docs as needed + - Get required approvals + +3. Merge Process + + - **Squash and merge** to develop + - Delete feature branch after merge + - Close related issues + +Post-Merge Steps +---------------- +1. Verify Changes + + - Confirm changes are working in develop + - Check documentation is published correctly + - Validate CI/CD pipeline passes + +2. Monitor + + - Watch for any issues in develop + - Be prepared to address any problems quickly + + + + diff --git a/docs/source/dataflow_spec_ref_cdc.rst b/docs/source/dataflow_spec_ref_cdc.rst new file mode 100644 index 0000000..a4520ca --- /dev/null +++ b/docs/source/dataflow_spec_ref_cdc.rst @@ -0,0 +1,163 @@ +Change Data Capture (CDC) Configuration +---------------------------------------- + +The ``cdcSettings`` and ``cdcSnapshotSettings`` enable and pass configuration info to the CDC API's. + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **cdcSettings** + - ``object`` + - See :ref:`cdcSettings` for more information. + * - **cdcSnapshotSettings** + - ``object`` + - See :ref:`cdcSnapshotSettings` for more information. + +cdcSettings +~~~~~~~~~~~~~~~~~ + +The ``cdcSettings`` object contains the following properties: + +.. 
list-table:: + :header-rows: 1 + + * - Parameter + - Type + - Description + * - **keys** + - ``list`` + - The column or combination of columns that uniquely identify a row in the source data. This is used to identify which CDC events apply to specific records in the target table. + * - **sequence_by** + - ``string`` + - The column name specifying the logical order of CDC events in the source data. Delta Live Tables uses this sequencing to handle change events that arrive out of order. + * - **scd_type** + - ``string`` + - Whether to store records as SCD type 1 or SCD type 2. Set to ``1`` for SCD type 1 or ``2`` for SCD type 2. + * - **apply_as_deletes** + - ``string`` + - (*optional*) Specifies when a CDC event should be treated as a DELETE rather than an upsert. + * - **where** + - ``string`` + - (*optional*) Filter the rows by a condition. + * - **ignore_null_updates** + - ``boolean`` + - (*optional*) Allow ingesting updates containing a subset of the target columns. When a CDC event matches an existing row and ignore_null_updates is True, columns with a null retain their existing values in the target. This also applies to nested columns with a value of null. When ignore_null_updates is False, existing values are overwritten with null values. + * - **except_column_list** + - ``list`` + - (*optional*) A list of columns to exclude from the upsert into the target table. + * - | **track_history_column_list** + | **track_history_except_column_list** + - ``list`` + - A subset of output columns to be tracked for history in the target table. Use track_history_column_list to specify the complete list of columns to be tracked. Use track_history_except_column_list to specify the columns to be excluded from tracking. + + +cdcSnapshotSettings +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``cdcSnapshotSettings`` object contains the following properties: + +.. 
list-table:: + :header-rows: 1 + + * - Parameter + - Type + - Description + * - **keys** + - ``list`` + - The column or combination of columns that uniquely identify a row in the source data. This is used to identify which CDC events apply to specific records in the target table. + * - **snapshotType** + - ``string`` + - The type of snapshot to process. Set to ``periodic`` for periodic snapshots or ``historical`` for historical snapshots (refer to :ref:`CDC Historical Snapshot Source Configuration` for which type to use). Note that ``historical`` snapshot types are not supported in ``flow`` data flow types. + * - **scd_type** + - ``string`` + - Whether to store records as SCD type 1 or SCD type 2. Set to ``1`` for SCD type 1 or ``2`` for SCD type 2. + * - **sourceType** + - ``string`` + - The type of source to ingest the snapshots from. Set to ``file`` for file based sources. + * - **source** + - ``object`` + - The source to ingest the snapshots from. This is required for ``historical`` snapshot types. See :ref:`cdc-apply-changes-from-snapshot-source` for more information. + * - **track_history_column_list** + - ``list`` + - (*optional*) A subset of output columns to be tracked for history in the target table. Use this to specify the complete list of columns to be tracked. This cannot be used in conjunction with ``track_history_except_column_list``. + * - **track_history_except_column_list** + - ``list`` + - (*optional*) A subset of output columns to be excluded from history tracking in the target table. Use this to specify which columns should not be tracked. This cannot be used in conjunction with ``track_history_column_list``. + +.. _cdc-apply-changes-from-snapshot-source: + +CDC Historical Snapshot Source Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + The ``source`` object contains the following properties for ``file`` based sources: + + .. 
list-table:: + :header-rows: 1 + + * - Parameter + - Type + - Description + * - **format** + - ``string`` + - The format of the source data, e.g. ``table``, ``parquet``, ``csv``, ``json``. All formats supported by Spark are accepted; see `PySpark Data Sources API `_. + * - **path** + - ``string`` + - The location to load the source data from. This can be a table name or a path to a file or directory with multiple snapshots. A placeholder ``{version}`` can be used in this path which will be substituted with the version value at run time. + * - **versionType** + - ``string`` + - The type of versioning to use. Can be either ``int`` or ``datetime``. + * - **datetimeFormat** + - ``string`` + - (*conditional*) Required if ``versionType`` is ``datetime``. The format of ``startingVersion`` datetime value. + * - **microSecondMaskLength** + - ``integer`` + - (*optional*) WARNING: Edge Cases Only! + Specify this if your ``versionType`` is ``datetime`` and your filename includes microseconds, but not the full 6 digits. The number of microsecond digits included at the end of the datetime value. + The default value is 6. + * - **startingVersion** + - ``string`` or ``integer`` + - (*optional*) The version to start processing from. + * - **readerOptions** + - ``object`` + - (*optional*) Additional options to pass to the reader. + * - **schemaPath** + - ``string`` + - (*optional*) The schema path to use for the source data. + * - **selectExp** + - ``list`` + - (*optional*) A list of select expressions to apply to the source data. + * - **filter** + - ``string`` + - (*optional*) A filter expression to apply to the source data. This filter is applied to the dataframe as a WHERE clause when the source is read. A placeholder ``{version}`` can be used in this filter expression which will be substituted with the version value at run time. 
+ * - **recursiveFileLookup** + - ``boolean`` + - (*optional*) When set to ``true``, enables recursive directory traversal to find snapshot files. This should be used when snapshots are stored in a nested directory structure such as Hive-style partitioning (e.g., ``/data/{version}/file.parquet``). When set to ``false`` (default), only files in the immediate directory are searched. Default: ``false``. + + + .. note:: + If ``recursiveFileLookup`` is set to ``true``, ensure that the ``path`` parameter is specified in a way that is compatible with recursive directory traversal. I.e. the ``{version}`` placeholder is used in the path and not the filename. + + The ``source`` object contains the following properties for ``table`` based sources: + + .. list-table:: + :header-rows: 1 + + * - Parameter + - Type + - Description + * - **table** + - ``string`` + - The table name to load the source data from. + * - **versionColumn** + - ``string`` + - The column name to use for versioning. + * - **startingVersion** + - ``string`` or ``integer`` + - (*optional*) The version to start processing from. + * - **selectExp** + - ``list`` + - (*optional*) A list of select expressions to apply to the source data. diff --git a/docs/source/dataflow_spec_ref_data_quality.rst b/docs/source/dataflow_spec_ref_data_quality.rst new file mode 100644 index 0000000..42585d2 --- /dev/null +++ b/docs/source/dataflow_spec_ref_data_quality.rst @@ -0,0 +1,60 @@ +Data Quality and Quarantine Configuration +------------------------------------------- + +These properties control how data quality issues are handled: + +.. list-table:: + :header-rows: 1 + :widths: 50 10 40 + + * - **Field** + - **Type** + - **Description** + * - **dataQualityExpectationsEnabled** (*optional*) + - ``boolean`` + - A flag indicating whether data quality expectations are enabled (see :doc:`feature_data_quality_quarantine`). 
+ * - **dataQualityExpectationsPath** (*optional*) + - ``string`` + - Either a relative path or filename for the expectations file. Note that the framework automatically calculates all relative paths from the appropriate expectations sub-folder, in the Pipeline Bundle. Examples: + + * All expectations files in the ``expectations`` sub-folder: ``.`` or ``*`` + * A specific expectations file: ``my_table_dqe.json`` + + * - **quarantineMode** (*optional*) + - ``string`` + - The mode for handling quarantined data. It can be `off`, `flag`, or `table`. + Supported: `["off", "flag", "table"]` + * - **quarantineTargetDetails** (*optional*) + - ``object`` + - Details about the quarantine target, only required if ``quarantineMode`` is set to ``table``. + See :ref:`quarantine-target-details` section below. + +.. _quarantine-target-details: + +quarantineTargetDetails +~~~~~~~~~~~~~~~~~~~~~~~ + +The `quarantineTargetDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Parameter + - Type + - Description + * - targetFormat + - ``string`` + - The format of the quarantine target. Currently, only ``delta`` is supported. + + | Supported: ``["delta"]`` + | Default: ``"delta"`` + * - table + - ``string`` + - (*conditional*) The table name, required if ``targetFormat`` is ``delta``. + * - tableProperties + - ``object`` + - (*conditional*) Additional properties for the table, required if ``targetFormat`` is ``delta``. + * - path + - ``string`` + - (*conditional*) The path to the table, required if ``targetFormat`` is ``delta``. 
\ No newline at end of file diff --git a/docs/source/dataflow_spec_ref_main_flows.rst b/docs/source/dataflow_spec_ref_main_flows.rst new file mode 100644 index 0000000..d7ba1f1 --- /dev/null +++ b/docs/source/dataflow_spec_ref_main_flows.rst @@ -0,0 +1,427 @@ +Creating a Flows Data Flow Spec Reference +############################################# + +A standard Data Flow Spec is the most basic type of Data Flow Spec and is suited to basic use cases where you are performing 1:1 ingestion or loads. It is particularly suited to Bronze Ingestion Use Cases. + +Example: +-------- + +The below sample demonstrates a flows Data Flow Spec for a Silver multi-source streaming use case (refer to :doc:`patterns_streaming_multi_source_streaming` for more information): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "etp5stg", + "dataFlowGroup": "etp5", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "staging_table_mrg_p5", + "schemaPath": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "partitionColumns": [] + }, + "cdcSettings": { + "keys": [ + "CONTRACT_ID" + ], + "sequence_by": "EXTRACT_DTTM", + "where": "", + "ignore_null_updates": true, + "except_column_list": [ + "__START_AT", + "__END_AT" + ], + "scd_type": "2", + "track_history_column_list": [], + "track_history_except_column_list": [] + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "off", + "quarantineTargetDetails": {}, + "flowGroups": [ + { + "flowGroupId": "et1", + "stagingTables": { + "staging_table_apnd_p5": { + "type": "ST", + "schemaPath": "" + } + }, + "flows": { + "f_contract": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_contract" + }, + "views": { + "v_brz_contract": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "contract", + "cdfEnabled": true, + "selectExp": [ + 
"*" + ], + "whereClause": [] + } + } + } + }, + "f_loan": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_loan" + }, + "views": { + "v_brz_loan": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "loan", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "staging_table_mrg_p5", + "sourceView": "staging_table_apnd_p5" + } + } + } + } + ] + } + + .. tab:: YAML + + .. code-block:: yaml + + dataFlowId: etp5stg + dataFlowGroup: etp5 + dataFlowType: flow + targetFormat: delta + targetDetails: + table: staging_table_mrg_p5 + schemaPath: '' + tableProperties: + delta.enableChangeDataFeed: 'true' + partitionColumns: [] + cdcSettings: + keys: + - CONTRACT_ID + sequence_by: EXTRACT_DTTM + where: '' + ignore_null_updates: true + except_column_list: + - __START_AT + - __END_AT + scd_type: '2' + track_history_column_list: [] + track_history_except_column_list: [] + dataQualityExpectationsEnabled: false + quarantineMode: 'off' + quarantineTargetDetails: {} + flowGroups: + - flowGroupId: et1 + stagingTables: + staging_table_apnd_p5: + type: ST + schemaPath: '' + flows: + f_contract: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_contract + views: + v_brz_contract: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: contract + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_loan: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_loan + views: + v_brz_loan: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: loan + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_merge: + flowType: merge + flowDetails: + targetTable: staging_table_mrg_p5 + sourceView: 
staging_table_apnd_p5 + +The above dataflow spec sample contains the following core components: + + * Dataflow metadata configuration + * Target configuration + * Data quality and quarantine settings + * Flow group configuration + +The following sections detail each of the above components. + +.. _dataflow-spec-flows-metadata-configuration: + +Dataflow Metadata Configuration +------------------------------- + +These properties define the basic identity and type of the dataflow: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **dataFlowId** + - ``string`` + - A unique identifier for the data flow. + * - **dataFlowGroup** + - ``string`` + - The group to which the data flow belongs, can be the same as `dataFlowId` if there is no group. + * - **dataFlowType** + - ``string`` + - The type of data flow. It can be either `flow` or `standard`. + Supported: ``flow``, ``standard`` + +.. _dataflow-spec-flows-target-configuration: + +Target Configuration +--------------------- + +These properties define where and how the data will be written: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **mode** + - ``string`` + - The mode of the data flow. + Supported: ``stream``, ``batch`` + * - **targetFormat** + - ``string`` + - The format of the target data. + If the format is `delta`, additional `targetDetails` must be provided. + * - **targetDetails** + - ``object`` + - See :doc:`dataflow_spec_ref_target_details`. + +.. _dataflow-spec-flows-flow-groups-configuration: + +Flow Group Configuration +------------------------ + +The `flowGroupDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **dataFlowID** (*optional*) + - ``string`` + - A unique identifier for the data flow. Only required when dataflow specs are split (see :doc:`splitting_dataflow_spec`). 
+ * - **flowGroupId** + - ``string`` + - A unique identifier for the flow group. + * - **stagingTables** (*optional*) + - ``object`` + - An object containing named objects representing staging tables in the flow group. The key for each nested object in this object will become the table names for the staging tables. + * - **flows** + - ``array`` + - An array of flows in the flow group. Items: :ref:`flow-configuration` + + +.. _dataflow-spec-flows-staging-table-configuration: + +Staging Table Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `stagingTableDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **type** + - ``string`` + - The type of the staging table can be either a Streaming Table or Materialized View. Supported: ``ST``, ``MV`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path of the staging table. + * - **partitionColumns** (*optional*) + - ``array`` + - An array of partition columns for the staging table. Items: ``string`` + * - **cdcSettings** (*optional*) + - ``object`` + - Change data capture (CDC) settings. Object: :doc:`dataflow_spec_ref_cdc` + +.. admonition:: Recommendation + :class: tip + + It is recommended that you avoid specifying a schema path for staging tables, in order to reduce maintenance overhead and to take advantage of schema evolution. + +.. _dataflow-spec-flows-flow-configuration: + +Flow Configuration +~~~~~~~~~~~~~~~~~~ + +A `flow` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **enabled** + - ``boolean`` + - A flag indicating whether the flow is enabled. + * - **flowType** + - ``string`` + - The type of the flow. + Supported: ``append_view``, ``append_sql``, ``merge`` + * - **flowDetails** + - ``object`` + - Details about the flow, required based on `flowType`. 
+ Properties vary based on `flowType`. See :ref:`Flow Details`. + * - **views** (*optional*) + - ``object`` + - An object containing views used in the flow. The key for each nested object in this object will become the view names. + +.. _dataflow-spec-flows-flow-details: + +Flow Details +~~~~~~~~~~~~~ + +The `flowDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Flow Type** + - **Property** + - **Type** + - **Description** + * - **append_sql** + - **targetTable** + - ``string`` + - The target table for the SQL append flow. + * - + - **sqlPath** + - ``string`` + - The path to the SQL file for the append flow. + * - **append_view** + - **targetTable** + - ``string`` + - The target table for the view append flow. + * - + - **sourceView** + - ``string`` + - The source view for the append flow. + * - + - **column_prefix** (*optional*) + - ``string`` + - The prefix for columns in the target table. + * - + - **column_prefix_exceptions** (*optional*) + - ``array`` + - An array of columns that are exceptions to the prefix rule. + * - **merge** + - **targetTable** + - ``string`` + - The target table for the merge flow. + * - + - **sourceView** + - ``string`` + - The source view for the merge flow. + +.. _dataflow-spec-flows-view-configuration: + +View Configuration +~~~~~~~~~~~~~~~~~~ + +The `viewDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **mode** + - ``string`` + - The mode of the view, either `batch` or `stream`. + Supported: ``batch``, ``stream`` + * - **sourceType** + - ``string`` + - The type of the source. + Supported: ``cloudFiles``, ``delta``, ``deltaJoin``, ``sql`` + * - **columnsToUpdate** (*optional*) + - ``array`` + - An array of columns to update. + Items: ``string`` + * - **sourceDetails** (*conditional*) + - ``object`` + - See :doc:`dataflow_spec_ref_source_details`. + + + +.. 
_dataflow-spec-flows-cdc-configuration: + +.. include:: dataflow_spec_ref_cdc.rst + +.. _dataflow-spec-flows-data-quality-configuration: + +.. include:: dataflow_spec_ref_data_quality.rst + +.. _dataflow-spec-flows-table-migration-configuration: + +.. include:: dataflow_spec_ref_table_migration.rst diff --git a/docs/source/dataflow_spec_ref_main_materialized_views.rst b/docs/source/dataflow_spec_ref_main_materialized_views.rst new file mode 100644 index 0000000..a8e9f18 --- /dev/null +++ b/docs/source/dataflow_spec_ref_main_materialized_views.rst @@ -0,0 +1,274 @@ +Creating a Materialized View Data Flow Spec Reference +#################################################### + +A Materialized View Data Flow Spec is designed for creating and maintaining materialized views that aggregate or transform data from source tables. + + +Schema +------ + +The following schema details the configuration for a Materialized View Data Flow Spec: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "feature_materialized_views", + "dataFlowGroup": "feature_samples", + "dataFlowType": "materialized_view", + "materializedViews": { + "mv_name": { + "sourceView": { + "sourceViewName": "", + "sourceType": "[delta|python|sql]", + "sourceDetails": {} + }, + "sqlPath": "", + "sqlStatement": "", + "tableDetails": { + "database": "", + "schemaPath": "", + "tableProperties": {}, + "path": "", + "partitionColumns": [], + "clusterByColumns": [] + }, + "dataQualityExpectationsEnabled": false, + "dataQualityExpectationsPath": "", + "quarantineMode": "off", + "quarantineTargetDetails": {} + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: feature_materialized_views + dataFlowGroup: feature_samples + dataFlowType: materialized_view + materializedViews: + mv_name: + sourceView: + sourceViewName: '' + sourceType: '[delta|python|sql]' + sourceDetails: {} + sqlPath: '' + sqlStatement: '' + tableDetails: + database: '' + schemaPath: '' + tableProperties: {} + path: '' + partitionColumns: [] + clusterByColumns: [] + dataQualityExpectationsEnabled: false + dataQualityExpectationsPath: '' + quarantineMode: 'off' + quarantineTargetDetails: {} + +Example: +-------- + +The below demonstrates a Materialized View Data Flow Spec: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "feature_materialized_views", + "dataFlowGroup": "feature_samples", + "dataFlowType": "materialized_view", + "materializedViews": { + "mv_from_source_view": { + "sourceView": { + "sourceViewName": "v_mv_source_view", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + }, + "tableDetails": { + "database": "{gold_schema}", + "tableProperties": { + "delta.autoOptimize.optimizeWrite": "true", + "delta.autoOptimize.autoCompact": "true" + }, + "clusterByColumns": ["year", "month"], + "schemaPath": "schemas/customer_metrics_mv.json" + } + }, + "mv_from_sql_path": { + "sqlPath": "./mv_from_sql_path.sql" + }, + "mv_from_sql_statement": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer" + }, + "mv_with_quarantine": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer_address", + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta" + } + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: feature_materialized_views + dataFlowGroup: feature_samples + dataFlowType: materialized_view + materializedViews: + mv_from_source_view: + sourceView: + sourceViewName: v_mv_source_view + sourceType: delta + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true + tableDetails: + database: '{gold_schema}' + tableProperties: + delta.autoOptimize.optimizeWrite: 'true' + delta.autoOptimize.autoCompact: 'true' + clusterByColumns: + - year + - month + schemaPath: schemas/customer_metrics_mv.json + mv_from_sql_path: + sqlPath: ./mv_from_sql_path.sql + mv_from_sql_statement: + sqlStatement: SELECT * FROM {staging_schema}.customer + mv_with_quarantine: + sqlStatement: SELECT * FROM {staging_schema}.customer_address + dataQualityExpectationsEnabled: true + dataQualityExpectationsPath: ./customer_address_dqe.json + quarantineMode: table + quarantineTargetDetails: + targetFormat: delta + +The above dataflow spec sample contains the following core components: + + * Dataflow metadata configuration + * Source configuration + * Table configuration + * Data quality and quarantine settings + +The following sections detail each of the above components. + +.. _dataflow-spec-materialized-view-metadata-configuration: + +Dataflow Metadata Configuration +------------------------------- + +These properties define the basic identity and type of the dataflow: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **dataFlowId** + - ``string`` + - A unique identifier for the data flow. + * - **dataFlowGroup** + - ``string`` + - The group to which the data flow belongs, can be the same as `dataFlowId` if there is no group. + * - **dataFlowType** + - ``string`` + - The type of data flow. Must be `materialized_view` for materialized view dataflows. + +.. 
_dataflow-spec-materialized-view-source-configuration: + +Source Configuration +--------------------- + +These properties define the source of the data: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **sourceSystem** (*optional*) + - ``string`` + - The source system name. Value is not used to determine or change any behaviour. + * - **sourceType** + - ``string`` + - The type of source. + Supported: ``cloudFiles``, ``delta``, ``sql``, ``python`` + * - **sourceViewName** + - ``string`` + - The name to assign the source view. + String Pattern: `v_([A-Za-z0-9_]+)` + * - **sourceDetails** + - ``object`` + - See :doc:`dataflow_spec_ref_source_details` for more information. + +.. _dataflow-spec-materialized-view-table-configuration: + +Table Configuration +------------------------------ + +These properties define the materialized view specific configuration: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **tableDetails** + - ``object`` + - Configuration specific to materialized views. + * - **database** + - ``string`` + - The schema to write the materialized view to. + * - **schemaPath** + - ``string`` + - The path to the schema file for the materialized view. + * - **tableProperties** + - ``object`` + - The table properties to set on the materialized view. + * - **path** + - ``string`` + - A storage location for table data. If not set, use the managed storage location for the schema containing the table. + * - **partitionColumns** + - ``array`` + - The columns to partition the materialized view by. + * - **clusterByColumns** + - ``array`` + - The suggested columns to cluster the materialized view by. + * - **comment** + - ``string`` + - A description for the materialized view. + * - **spark_conf** (*optional*) + - ``object`` + - A list of Spark configurations for the execution of this query. 
+ * - **private** (*optional*) + - ``boolean`` + - Create a table, but do not publish the table to the metastore. + +.. _dataflow-spec-materialized-view-data-quality-configuration: + +.. include:: dataflow_spec_ref_data_quality.rst diff --git a/docs/source/dataflow_spec_ref_main_standard.rst b/docs/source/dataflow_spec_ref_main_standard.rst new file mode 100644 index 0000000..c5272e6 --- /dev/null +++ b/docs/source/dataflow_spec_ref_main_standard.rst @@ -0,0 +1,201 @@ +Creating a Standard Data Flow Spec Reference +############################################# + +A standard Data Flow Spec is the most basic type of Data Flow Spec and is suited to basic use cases where you are performing 1:1 ingestion or loads. It is particularly suited to Bronze Ingestion Use Cases. + +Example: +-------- + +The below demonstrates a standard Data Flow Spec for a Bronze ingestion use case (refer to :doc:`patterns_streaming_basic_1_to_1` for more information): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "crm_1", + "dataFlowGroup": "crm", + "dataFlowType": "standard", + "sourceType": "delta", + "sourceSystem": "crm", + "sourceViewName": "v_customer_address", + "sourceDetails": { + "database": "source_db", + "table": "customer_address", + "cdfEnabled": true, + "schemaPath": "schemas/customer_address.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_address", + "tableProperties": { + "delta.autoOptimize.optimizeWrite": "true", + "delta.autoOptimize.autoCompact": "true" + }, + "partitionColumns": ["country_code"], + "schemaPath": "schemas/customer_address.json" + }, + "dataQualityExpectationsEnabled": true, + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta", + "table": "customer_address_quarantine", + "tableProperties": {} + }, + "cdcSettings": { + "keys": ["address_id"], + "sequence_by": "updated_timestamp", + "scd_type": "2", + "where": "", + "ignore_null_updates": true, 
+ "except_column_list": ["updated_timestamp"], + "apply_as_deletes": "DELETE_FLAG = True" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + dataFlowId: crm_1 + dataFlowGroup: crm + dataFlowType: standard + sourceType: delta + sourceSystem: crm + sourceViewName: v_customer_address + sourceDetails: + database: source_db + table: customer_address + cdfEnabled: true + schemaPath: schemas/customer_address.json + mode: stream + targetFormat: delta + targetDetails: + table: customer_address + tableProperties: + delta.autoOptimize.optimizeWrite: 'true' + delta.autoOptimize.autoCompact: 'true' + partitionColumns: + - country_code + schemaPath: schemas/customer_address.json + dataQualityExpectationsEnabled: true + quarantineMode: table + quarantineTargetDetails: + targetFormat: delta + table: customer_address_quarantine + tableProperties: {} + cdcSettings: + keys: + - address_id + sequence_by: updated_timestamp + scd_type: '2' + where: '' + ignore_null_updates: true + except_column_list: + - updated_timestamp + apply_as_deletes: DELETE_FLAG = True + +The above dataflow spec sample contains the following core components: + + * Dataflow metadata configuration + * Source configuration + * Target configuration + * Data quality and quarantine settings + * CDC (SCD2) configuration + +The following sections detail each of the above components. + +.. _dataflow-spec-standard-metadata-configuration: + +Dataflow Metadata Configuration +------------------------------- + +These properties define the basic identity and type of the dataflow: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **dataFlowId** + - ``string`` + - A unique identifier for the data flow. + * - **dataFlowGroup** + - ``string`` + - The group to which the data flow belongs, can be the same as `dataFlowId` if there is no group. + * - **dataFlowType** + - ``string`` + - The type of data flow. It can be either `flow` or `standard`. 
+ Supported: `["flow", "standard"]` + +.. _dataflow-spec-standard-source-configuration: + +Source Configuration +--------------------- + +These properties define the source of the data: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **sourceSystem** (*optional*) + - ``string`` + - The source system name. Value is not used to determine or change any behaviour, required if `dataFlowType` is `standard`. + * - **sourceType** + - ``string`` + - The type of source, required if `dataFlowType` is `standard`. + Supported: ``cloudFiles``, ``delta``, ``deltaJoin``, ``kafka`` + * - **sourceViewName** + - ``string`` + - The name to assign the source view, required if `dataFlowType` is `standard`. + String Pattern: `v_([A-Za-z0-9_]+)` + * - **sourceDetails** + - ``object`` + - See :doc:`dataflow_spec_ref_source_details` for more information. + +.. _dataflow-spec-standard-target-configuration: + +Target Configuration +--------------------- + +These properties define where and how the data will be written: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **mode** + - ``string`` + - The mode of the data flow. + Supported: ``["stream", "batch"]`` + * - **targetFormat** + - ``string`` + - The format of the target data. + If the format is `delta`, additional `targetDetails` must be provided. + * - **targetDetails** + - ``object`` + - See :doc:`dataflow_spec_ref_target_details`. + +.. _dataflow-spec-standard-cdc-configuration: + +.. include:: dataflow_spec_ref_cdc.rst + +.. _dataflow-spec-standard-data-quality-configuration: + +.. include:: dataflow_spec_ref_data_quality.rst + +.. _dataflow-spec-standard-table-migration-configuration: + +.. 
include:: dataflow_spec_ref_table_migration.rst \ No newline at end of file diff --git a/docs/source/dataflow_spec_ref_source_details.rst b/docs/source/dataflow_spec_ref_source_details.rst new file mode 100644 index 0000000..5b87dfc --- /dev/null +++ b/docs/source/dataflow_spec_ref_source_details.rst @@ -0,0 +1,307 @@ +Data Flow Spec - Source Details +############################## + +The `sourceDetails` object can be any of the following, based on the `sourceType`: + +Batch Files +---------------- + +The `sourceBatchFiles` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **format** + - ``string`` + - The format of the batch files. Supported: `["csv", "json", "parquet", "text", "xml"]` + * - **path** + - ``string`` + - The path to the batch files. + * - **readerOptions** + - ``object`` + - Options for reading the batch files. See `definitions_sources.json` schema for supported options. + * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path. + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + +Cloud Files +---------------- + +The `sourceCloudFiles` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **path** + - ``string`` + - The path to the cloud files. + * - **readerOptions** + - ``object`` + - Options for reading the cloud files. See `definitions_sources.json` schema for supported options. + * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. 
Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path. + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + +Delta +---------------- + +The `sourceDelta` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **database** + - ``string`` + - The database name. + * - **table** + - ``string`` + - The table name. + * - **cdfEnabled** + - ``boolean`` + - Whether change data feed (CDF) is enabled. + * - **tablePath** (*optional*) + - ``string`` + - The table path. + * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path. + * - **readerOptions** (*optional*) + - ``object`` + - Additional reader options. See `definitions_sources.json` schema for supported options. + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + * - **startingVersionFromDLTSetup** (*optional*) + - ``boolean`` + - Whether to automatically set reader option 'startingVersion' to the last time the SDP Setup operation was run on the source table. This helps to ensure CDF is read from the last time source table was reset (full refresh). + +Delta Join +---------------- + +The `sourceDeltaJoin` object contains the following properties: + +**Sources:** + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **database** + - ``string`` + - The database name. + * - **table** + - ``string`` + - The table name. 
+ * - **alias** + - ``string`` + - The alias for the table. + * - **joinMode** + - ``string`` + - The join mode. Supported: `["stream", "static"]`, Default: `"stream"` + * - **cdfEnabled** + - ``boolean`` + - Whether change data feed (CDF) is enabled. + * - **tablePath** (*optional*) + - ``string`` + - The table path. + * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path. + * - **readerOptions** (*optional*) + - ``object`` + - Additional reader options. See `definitions_sources.json` schema for supported options. + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + +**Joins:** + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **joinType** + - ``string`` + - The join type. Supported: `["left", "inner"]`, Default: `"left"` + * - **condition** + - ``string`` + - The join condition. + +**Additional Properties:** + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + +Kafka +---------------- + +The `sourceKafkaReader` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **readerOptions** + - ``object`` + - Options for reading from Kafka. See `definitions_sources.json` schema for supported options. 
+ * - **selectExp** (*optional*) + - ``array`` + - An array of select expressions. Items: ``string`` + * - **whereClause** (*optional*) + - ``array`` + - An array of where clauses. Items: ``string`` + * - **schemaPath** (*optional*) + - ``string`` + - The schema path. + * - **pythonTransform** (*optional*) + - ``string`` + - The Python transform configuration. See :ref:`pythonTransform-object` for supported options. + +Kafka SQL +---------------- + +In progress + +Python +---------------- + +The `sourcePython` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **functionPath** (*optional*) + - ``string`` + - The path to the Python file, which should live in the `python_functions` subdirectory. + * - **pythonModule** (*optional*) + - ``string`` + - The module to import the Python function from. + * - **tokens** + - ``object`` + - A dictionary of tokens that will be passed to the Python function. This allows you to pass in substitution variables from the data flow spec. + +.. important:: + + - You must select one of `functionPath` or `pythonModule`. + +SQL +---------------- + +The `sourceSql` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **sqlPath** (*optional*) + - ``string`` + - The path to the SQL file, which should live in the `dml` subdirectory. + * - **sqlStatement** (*optional*) + - ``string`` + - The SQL statement to execute. + +.. important:: + + - While the `sqlPath` and `sqlStatement` properties are optional you must select one. + - If both `sqlPath` and `sqlStatement` are provided, `sqlStatement` will take precedence. + + +.. _pythonTransform-object: + +Python Transform Object +----------------------- + +The `pythonTransform` object can be used to specify a Python transform function to be applied to the dataframe post read. 
It can contain the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **functionPath** (*optional*) + - ``string`` + - The path to the Python file, which should live in the `python_functions` subdirectory. + * - **module** (*optional*) + - ``string`` + - The module to import the Python function from. + * - **tokens** (*optional*) + - ``object`` + - A dictionary of tokens that will be passed to the Python function. This allows you to pass in substitution variables from the data flow spec. + +.. important:: + + - You must select one of `functionPath` or `module`. diff --git a/docs/source/dataflow_spec_ref_table_migration.rst b/docs/source/dataflow_spec_ref_table_migration.rst new file mode 100644 index 0000000..bbef737 --- /dev/null +++ b/docs/source/dataflow_spec_ref_table_migration.rst @@ -0,0 +1,80 @@ +Table Migration Configuration +----------------------------- + +These properties control table migration: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **tableMigrationDetails** (*optional*) + - ``object`` + - Details about table migration, only required if a table migration is needed. + See :ref:`Table Migration Details` section below. + +.. _table-migration-details: + +tableMigrationDetails +~~~~~~~~~~~~~~~~~~~~~ + +The `tableMigrationDetails` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Property** + - **Type** + - **Description** + * - **enabled** + - ``boolean`` + - A flag indicating whether table migration is enabled. + * - **catalogType** + - ``string`` + - The type of catalog, either `hms` or `uc`. + Supported values: `["hms", "uc"]` + * - **autoStartingVersionsEnabled** (*optional*) + - ``boolean`` + - Flag to enable automatic starting version management. 
When enabled, the system automatically tracks source table versions and manages starting versions for views. Defaults to ``true``. + * - **sourceDetails** + - ``object`` + - Details about the source for migration. + See :ref:`source-migrate-delta`. + +.. _source-migrate-delta: + +sourceDetails +^^^^^^^^^^^^^ + +The `sourceDetails` object can potentially cater to different types of sources but is currently limited to the following: + +* `sourceMigrateDelta` + +**sourceMigrateDelta** + +The ``sourceMigrateDelta`` object contains the following properties: + +.. list-table:: + :header-rows: 1 + + * - Field + - Type + - Description + * - **database** + - ``string`` + - The database name. + * - **table** + - ``string`` + - The table name. + * - **selectExp** (*optional*) + - ``array`` (items: ``string``) + - An array of select expressions. + * - **whereClause** (*optional*) + - ``array`` (items: ``string``) + - An array of where clauses. + * - **exceptColumns** (*optional*) + - ``array`` (items: ``string``) + - An array of columns to exclude. diff --git a/docs/source/dataflow_spec_ref_target_details.rst b/docs/source/dataflow_spec_ref_target_details.rst new file mode 100644 index 0000000..819b099 --- /dev/null +++ b/docs/source/dataflow_spec_ref_target_details.rst @@ -0,0 +1,283 @@ +Target Details Reference +####################### + +The target details object specifies how and where data should be written in your dataflow. This section documents the configuration options available for different target formats. + +.. _dataflow_spec_ref_target_details_delta: + +Delta Target Details +------------------- + +When using Delta format as your target, the following properties are available: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **table** + - ``string`` + - The name of the target table. + * - **database** (*optional*) + - ``string`` + - The database name for the target table. 
If not specified, the default database will be used. + * - **tableProperties** (*optional*) + - ``object`` + - A map of Delta table properties to set on the target table. Common properties include: + + - ``delta.autoOptimize.optimizeWrite`` + - ``delta.autoOptimize.autoCompact`` + - ``delta.enableChangeDataFeed`` + * - **partitionColumns** (*optional*) + - ``array[string]`` + - List of columns to partition the table by. + * - **clusterBy** (*optional*) + - ``array[string]`` + - List of columns to cluster the table by. + * - **clusterByAuto** (*optional*) + - ``boolean`` + - When true, the clustering keys will be automatically selected based on the data in the table. + * - **schemaPath** (*optional*) + - ``string`` + - Path to a schema file that defines the expected structure of the target table. + * - **mergeSchema** (*optional*) + - ``boolean`` + - When true, allows the schema to be updated when new columns are present in the source. + * - **overwriteSchema** (*optional*) + - ``boolean`` + - When true, allows complete replacement of the existing schema with a new one. + * - **comment** + - ``string`` + - A description for the target table. + * - **spark_conf** (*optional*) + - ``object`` + - A list of Spark configurations for the execution of this query. + +.. _dataflow_spec_ref_target_details_kafka: + +Delta Sink Target Details +------------------------- + +When using a Delta Sink as a target, the following properties are available: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **name** + - ``string`` + - The name of the Delta Sink. + * - **sinkOptions** + - ``object`` + - The options for the Delta Sink. + +Delta sinkOptions +~~~~~~~~~~~~~~~~~ + +Please refer to the Databricks documentation for the most up-to-date information on the Delta Sink: https://docs.databricks.com/en/dlt/dlt-sinks + +You must specify one of the below properties: + +.. 
list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **tableName** + - ``string`` + - The fully qualified name of the Delta table to write to. Three level namespace for UC e.g. `catalog_name.schema_name.table_name` + * - **path** + - ``string`` + - The path to the Delta table to write to e.g. `/Volumes/catalog_name/schema_name/volume_name/path/to/data` + + +Kafka Sink Target Details +------------------------- + +When using a Kafka Sink as a target, the following properties are available: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **name** + - ``string`` + - The name of the Kafka topic to write records to. + * - **sinkOptions** + - ``object`` + - Kafka configuration properties as key-value pairs. + +.. _dataflow_spec_ref_target_details_kafka_options: + +Kafka sinkOptions +~~~~~~~~~~~~~~~~~ + +The `kafkaOptions` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Option** + - **Type** + - **Description** + * - **kafka.bootstrap.servers** + - ``string`` + - The Kafka bootstrap servers connection string + * - **kafka.group.id** + - ``string`` + - The consumer group ID + * - **kafka.security.protocol** + - ``string`` + - Security protocol to use (defaults to SASL_SSL) + * - **kafka.ssl.truststore.location** + - ``string`` + - Location of the SSL truststore file + * - **kafka.ssl.truststore.password.accessKeyName** + - ``string`` + - Access key name for the truststore password + * - **kafka.ssl.truststore.password.secretScopeName** + - ``string`` + - Secret scope name containing the truststore password + * - **kafka.ssl.keystore.location** + - ``string`` + - Location of the SSL keystore file + * - **kafka.ssl.keystore.password.accessKeyName** + - ``string`` + - Access key name for the keystore password + * - **kafka.ssl.keystore.password.secretScopeName** + - ``string`` + - Secret scope name containing 
the keystore password + +Foreach Batch Sink Target Details +-------------------------------- + +When using a Foreach Batch Sink as a target, the following properties are available: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **name** + - ``string`` + - The name of the Foreach Batch Sink. + * - **type** + - ``string`` + - The type of the Foreach Batch Sink. Supported values: `["basic_sql", "python_function"]` + * - **config** + - ``object`` + - The configuration for the Foreach Batch Sink type. + +Foreach Batch basic_sql config +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `basic_sql` object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **database** + - ``string`` + - The database name for the target table. If not specified, the default database will be used. + * - **table** + - ``string`` + - The name of the target table. + * - **sqlPath** + - ``string`` + - The path to the SQL file to execute. + * - **sqlStatement** + - ``string`` + - The SQL statement to execute. This is an alternative to `sqlPath` and is mutually exclusive with it. + * - **partitionBy** (*optional*) + - ``array[string]`` + - List of columns to partition the table by. + * - **clusterBy** (*optional*) + - ``array[string]`` + - List of columns to cluster the table by. + * - **tableProperties** (*optional*) + - ``object`` + - A map of Delta table properties to set on the target table. + +.. note:: + The SELECT statement specified via the `sqlPath` or `sqlStatement` property must: + + * reference `micro_batch_view` as the source table in the FROM clause of the query that retrieves data from the source view. + * be a batch query, i.e. do not wrap the `micro_batch_view` in a STREAM() function. + +Basic example: + + .. code-block:: sql + + SELECT + * + FROM micro_batch_view + +Subquery Example: + +.. 
code-block:: sql + + SELECT + * + FROM ( + + SELECT + * + FROM micro_batch_view + ) + +CTE Example: + + .. code-block:: sql + + WITH source_cte AS ( + SELECT + * + FROM micro_batch_view + ) + + SELECT + * + FROM source_cte + +Foreach Batch python_function config +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `python_function` config object contains the following properties: + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Field** + - **Type** + - **Description** + * - **functionPath** (*optional*) + - ``string`` + - The path to the Python file, which should live in the `python_functions` subdirectory. + * - **module** (*optional*) + - ``string`` + - The module to import the Python function from. + * - **tokens** (*optional*) + - ``object`` + - A map of tokens to pass to the Python function. + +.. important:: + + - You must select one of `functionPath` or `module`. diff --git a/docs/source/dataflow_spec_reference.rst b/docs/source/dataflow_spec_reference.rst new file mode 100644 index 0000000..2b38678 --- /dev/null +++ b/docs/source/dataflow_spec_reference.rst @@ -0,0 +1,19 @@ +Data Flow Spec Reference +######################## + +Key concepts that you should familiarize yourself with before reading this section are explained in the section: :ref:`concepts_data_flows`. + +A Data Flow Spec is a JSON file that defines the structure of a single data flow that is ultimately executed by a Spark Declarative Pipeline. + +.. important:: + + * A Data Flow Spec must adhere to the schemas defined by the framework, which is documented in this section. + * In general a single Data Flow Spec will be contained in one file and must be named with the suffix ``_main.json`` to be picked up by the framework. + * In the case of Flows Data Flow Specs, the Data Flow Spec can also be broken up into a main and one or more flow files. The main spec file will contain the main pipeline configuration and the flow spec file will contain the flow groups. 
This is explained further in the section: :doc:`splitting_dataflow_spec`. + +.. toctree:: + :maxdepth: 1 + + dataflow_spec_ref_main_standard + dataflow_spec_ref_main_flows + dataflow_spec_ref_main_materialized_views \ No newline at end of file diff --git a/docs/source/deploy_ci_cd.rst b/docs/source/deploy_ci_cd.rst new file mode 100644 index 0000000..e26c0a2 --- /dev/null +++ b/docs/source/deploy_ci_cd.rst @@ -0,0 +1,121 @@ +Setting up CI/CD +################# + + +This section describes the required general steps in a CI/CD pipeline to deploy the framework bundle. +For a specific CI/CD platform example using GitHub Actions, see https://docs.databricks.com/en/dev-tools/bundles/ci-cd-bundles.html + +Prerequisites and Assumptions +----------------------------- +1. You have a Databricks access token for the CI/CD agent to authenticate to your Databricks workspace. +2. CI/CD agent has Python and Git installed. +3. CI/CD agent has access to the framework bundle repository. + + +Main Steps in a CI/CD Pipeline +------------------------------ +1. Install Databricks CLI + If the CI/CD agent you are using is not already using an image which has Databricks CLI installed, you can install it using curl: + +.. code-block:: bash + + curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh + + +2. Confirm Databricks CLI is installed + +.. code-block:: bash + + databricks --version + + +3. Configure Databricks CLI + +.. code-block:: bash + + export DATABRICKS_HOST="https://" + + export DATABRICKS_TOKEN="" + # or + export DATABRICKS_CLIENT_ID="" + export DATABRICKS_CLIENT_SECRET="" + +4. Clone the framework bundle repository + +.. code-block:: bash + + git clone https://github.com/databricks/framework-bundle.git + +5. Install dependencies for bundle validation + +.. code-block:: bash + + pip install -r requirements.txt + +6. Validate bundle configuration + +.. code-block:: bash + + databricks bundle validate + +7. Deploy current version + +.. 
code-block:: bash + + databricks bundle deploy --var="version=current" -t $ENVIRONMENT + +8. Deploy specific version for rollback + +.. code-block:: bash + + databricks bundle deploy --var="version=[version-number]" -t $ENVIRONMENT + + +Example CI/CD bash script +-------------------- +Here's an example deployment script that can be used in your CI/CD pipeline: + +.. code-block:: bash + + #!/bin/bash + set -e + + # Script arguments + ENVIRONMENT=${1:-dev} # Default to dev if not specified + FRAMEWORK_VERSION=${2:-1.2.3} # Default version for rollback if not specified + + # Install Databricks CLI if not already installed + if ! command -v databricks &> /dev/null; then + echo "Installing Databricks CLI..." + curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh + fi + + # Verify Databricks CLI installation + databricks --version + + # Verify required environment variables are set + if [ -z "$DATABRICKS_HOST" ] || { [ -z "$DATABRICKS_TOKEN" ] && [ -z "$DATABRICKS_CLIENT_ID" ]; }; then + echo "Error: Required environment variables not set" + echo "Please set:" + echo " DATABRICKS_HOST" + echo " DATABRICKS_TOKEN or (DATABRICKS_CLIENT_ID and DATABRICKS_CLIENT_SECRET)" + exit 1 + fi + + # Install dependencies + echo "Installing dependencies..." + pip install -r requirements.txt + + # Validate bundle configuration + echo "Validating bundle configuration..." + databricks bundle validate + + # Deploy current version + echo "Deploying current version to $ENVIRONMENT..." + databricks bundle deploy --var="version=current" -t $ENVIRONMENT + + # Deploy specific version for rollback + echo "Deploying version $FRAMEWORK_VERSION to $ENVIRONMENT for rollback..." + databricks bundle deploy --var="version=$FRAMEWORK_VERSION" -t $ENVIRONMENT + + echo "Deployment complete!" 
diff --git a/docs/source/deploy_enterprise.rst b/docs/source/deploy_enterprise.rst new file mode 100644 index 0000000..4342f54 --- /dev/null +++ b/docs/source/deploy_enterprise.rst @@ -0,0 +1,8 @@ +Deploy +====== + +.. toctree:: + :maxdepth: 1 + + deploy_ci_cd + deploy_versioning \ No newline at end of file diff --git a/docs/source/deploy_framework.rst b/docs/source/deploy_framework.rst new file mode 100644 index 0000000..741a061 --- /dev/null +++ b/docs/source/deploy_framework.rst @@ -0,0 +1,9 @@ +Deploy the Framework +===================== + +.. toctree:: + :maxdepth: 1 + + deploy_framework_bundle + deploy_ci_cd + feature_versioning_framework diff --git a/docs/source/deploy_framework_bundle.rst b/docs/source/deploy_framework_bundle.rst new file mode 100644 index 0000000..ad0f4bf --- /dev/null +++ b/docs/source/deploy_framework_bundle.rst @@ -0,0 +1,54 @@ +Deploying the Framework +########################### + +.. _local_deployment: + +Deploying From Your Local Machine +================================= + +The steps below will guide you through deploying the Lakeflow Framework to your Databricks workspace and assume you have cloned the Lakeflow Framework repository and are in the root directory of the repository. + +1. Ensure you have the Databricks CLI installed and configured. If not, please refer to the `Databricks CLI documentation `_. +2. Ensure the correct Databricks workspace is set as the workspace host field in the databricks.yml file or ensure no host is set to use the default host configured on the profile used by the Databricks CLI (Databricks CLI should be configured with credentials to access this workspace). + databricks.yml file should look like this to add a host: + + .. code-block:: yaml + + bundle: + name: dlt_framework + + include: + - resources/*.yml + + targets: + dev: + mode: development + default: true + workspace: + host: https:// + +3. Run the following command from the root directory to validate the Lakeflow Framework bundle: + + .. 
code-block:: console + + databricks bundle validate + + This command will run a series of checks to ensure the bundle is correctly set up and ready for deployment. +4. Run the following command to deploy the Lakeflow Framework to your Databricks workspace: + + .. code-block:: console + + databricks bundle deploy + +5. Once the deployment is successful, you should see the Lakeflow Framework bundle in your Databricks workspace. + To verify, you can go to your Databricks workspace and check if the bundle is present in the ``.bundle`` directory. + +.. Note:: + Databricks CLI will deploy the bundle to the default target workspace (usually dev by default) specified in the databricks.yml file. If you want to deploy the bundle to a different target, you can specify the target using the ``-t`` option in the deploy command. + Databricks CLI will deploy using default credentials. If you want to deploy using a different set of credentials, you can specify the profile using the ``-p`` option in the deploy command. + +.. _ci_cd_deployment: + +Deploying via CI/CD +=================== +Please refer to the CI/CD documentation for more information on how to deploy the Lakeflow Framework samples using CI/CD. diff --git a/docs/source/deploy_pipeline_bundle.rst b/docs/source/deploy_pipeline_bundle.rst new file mode 100644 index 0000000..b421562 --- /dev/null +++ b/docs/source/deploy_pipeline_bundle.rst @@ -0,0 +1,36 @@ +Deploying a Pipeline Bundle +########################### + +.. _local_deployment: + +Deploying From Your Local Machine +================================= + +Once you have created a data pipeline bundle and deployed the Lakeflow Framework, you can deploy it to your Databricks workspace. + +1. Ensure you have the Databricks CLI installed and configured. If not, please refer to the `Databricks CLI documentation `_. +2. 
Ensure the correct Databricks workspace is set as the workspace host field in the databricks.yml file (Databricks CLI should be configured with credentials to access this workspace). +3. Run the following command to validate the data pipeline bundle: + + .. code-block:: console + + databricks bundle validate + This command will run a series of checks to ensure the bundle is correctly set up and ready for deployment. +4. Run the following command to deploy the data pipeline bundle to your Databricks workspace: + + .. code-block:: console + + databricks bundle deploy --var="pipeline_framework_path=/Workspace/Users//.bundle///current/files/src" + The owner is your databricks user id. + + +5. Once the deployment is successful, you should see the data pipeline bundle in your Databricks workspace. + + To verify, you can go to your Databricks workspace and check if the bundle is present in the ``.bundle`` directory. + Also verify that a Spark Declarative Pipeline has been created in the Databricks workspace with the name of the pipeline being the name provided in the resources yaml file for the Spark Declarative Pipeline. + +.. _ci_cd_deployment: + +Deploying via CI/CD +=================== +Please refer to the CI/CD documentation for more information on how to deploy the Lakeflow Framework samples using CI/CD. diff --git a/docs/source/deploy_samples.rst b/docs/source/deploy_samples.rst new file mode 100644 index 0000000..72379cc --- /dev/null +++ b/docs/source/deploy_samples.rst @@ -0,0 +1,177 @@ +The Samples +########### + +The Framework comes with extensive samples that demonstrate the use of the framework and Lakeflow concepts. At the time of writing, samples are organized into the following bundles: + +* Bronze +* Silver +* Gold +* Test Data and Orchestrator +* TPC-H + +The samples broadly break down into the following: + +.. 
list-table:: + :header-rows: 1 + + * - Sample Type + - Folder + - Description + * - **Base and Pattern Samples** + - - ``/src/dataflows/base_samples`` + - ``/src/dataflows/`` + - Bronze, Silver and Gold samples that demonstrate the patterns and data examples used in the :doc:`patterns` section of the documentation + * - **Feature Samples** + - ``/src/dataflows/feature_samples`` + - Sample per key feature + * - **Kafka Samples** + - ``/src/dataflows/kafka_samples`` + - Base Kafka, Confluent schema registry and SQL off Kafka samples + * - **TPC-H Sample** + - Separate bundle for TPC-H samples + - Based on TPC-H schema in UC samples catalog, reverse engineered to demonstrate end to end streaming data warehouse + +.. _local_sample_deployment: + +Deploying the Samples +--------------------- + +The samples can be deployed using the scripts located in the ``samples`` directory: + + * ``deploy.sh``: Deploys all the samples except for TPC-H. + * ``deploy_bronze.sh``: Deploys only the bronze samples. + * ``deploy_silver.sh``: Deploys only the silver samples. + * ``deploy_gold.sh``: Deploys only the gold samples. + * ``deploy_orchestrator.sh``: Deploys only the test data and orchestrator bundle. + * ``deploy_tpch.sh``: Deploys only the TPC-H sample. + +Prerequisites: + +* Databricks CLI installed and configured +* Lakeflow framework already deployed to your workspace (see :doc:`deploy_framework`) + +Interactive Deployment +^^^^^^^^^^^^^^^^^^^^^^^ + +1. Navigate to the samples directory in the root of the Framework repository: + + .. code-block:: console + + cd samples + +2. Run the desired deploy script: + + .. code-block:: console + + ./deploy.sh + +3. Follow the prompts to deploy the samples. + + * **Databricks username**: Your Databricks username in the workspace you are deploying to e.g. ``jane.doe@company.com``. + * **Databricks workspace**: The full URL of the workspace you are deploying to e.g. ``https://company.cloud.databricks.com``.
+ * **Databricks CLI profile**: The Databricks CLI profile you want to use for the deployment. Default: ``DEFAULT``. + * **Select Compute**: Select between Classic/Enhanced or Serverless compute (0=Enhanced, 1=Serverless). Default: ``1``. + * **UC Catalog**: The Unity Catalog you want to use for the deployment. Default: ``main``. + * **Schema Namespace**: The first part of the name for the bronze, silver and gold schemas. Default: ``lakeflow_samples``. + * **Logical environment**: The logical environment you want to use for the deployment e.g. ``_test``. + + .. important:: + + Always specify a logical environment when deploying the samples. This ensures you don't overwrite anyone else's existing samples in the workspace, as long as the logical environment is unique. + + Suggested naming: + + * Your initials, e.g. Jane Doe would be ``_jd`` + * A Story ID, e.g. ``123456`` would be ``_123456`` + * Your client name, e.g. Company would be ``_client`` + * Others: business unit, team name, project name, etc... + +4. Once deployment is complete, you can find the deployed bundles under ``/Users//.bundle/`` + +Single Command line deployment: +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Navigate to the samples directory in the root of the Framework repository: + + .. code-block:: console + + cd samples + +2. Run the desired deploy script with required parameters: + + .. code-block:: console + + ./deploy.sh -u -h [-p ] [-c ] [-l ] [--catalog ] [--schema_namespace ] + + Parameters: + + * ``-u, --user``: Your Databricks username (required) + * ``-h, --host``: Databricks workspace host URL (required) + * ``-p, --profile``: Databricks CLI profile (optional). Default: ``DEFAULT``. + * ``-c, --compute``: The type of compute to use (0=Enhanced, 1=Serverless). Default: ``1``. + * ``-l, --logical_env``: Logical environment suffix for schema names (optional). Default: ``_test``. + * ``--catalog``: Unity Catalog name (optional). Default: ``main``.
+ * ``--schema_namespace``: Override the first part of the name for the bronze, silver and gold schemas (optional). Default: ``lakeflow_samples``. + + For example: + + .. code-block:: console + + ./deploy.sh -u jane.doe@company.com -h https://company.cloud.databricks.com -l _jd -c 1 + +3. Once deployment is complete, you can find the deployed bundles under ``/Users//.bundle/`` + +Using the Samples +----------------- + +Test Data and Orchestrator +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Test Data and Orchestrator bundle includes: + +* Test data initialization and load simulation +* Multiple jobs to simulate end to end runs of the samples + +**Jobs** + +After deployment you should find the following jobs in your workspace: + +* Lakeflow Framework Samples - Run 1 - Load and Schema Initialization +* Lakeflow Framework Samples - Run 2 - Load +* Lakeflow Framework Samples - Run 3 - Load +* Lakeflow Framework Samples - Run 4 - Load + +These will be prefixed with the target and your username and suffixed with the logical environment you provided when deploying the samples. + +For example: +``[dev jane_doe] Lakeflow Framework Samples - Run 1 - Load and Schema Initialization (_jd)`` + +To execute the samples, simply execute the jobs in order to simulate the end to end run of the samples over the test data. + +**Pipelines** + +You can also execute individual pipelines; these follow a similar naming convention with ``Lakeflow Samples`` in the name. + +Destroying the Samples +---------------------- + +To destroy the samples, you can use the ``destroy.sh`` script following the command specified below. + +.. 
code-block:: console + + ./destroy.sh -h [-p ] [-l ] + +Parameters: + + * ``-h, --host``: Databricks workspace host URL (required) + * ``-p, --profile``: Databricks CLI profile (optional, defaults to DEFAULT) + * ``-l, --logical_env``: Logical environment suffix for schema names (optional) + +TPC-H Sample +------------ + +The TPC-H sample is based off the TPC-H schema in the UC catalog and reverse engineered to demonstrate an end-to-end streaming data warehouse. + +To deploy the TPC-H sample, you can use the ``deploy_tpch.sh`` script following the same methods specified above. + +This sample is currently still being built with an initial cut targeted for Sept 2025. \ No newline at end of file diff --git a/docs/source/feature_auto_complete.rst b/docs/source/feature_auto_complete.rst new file mode 100644 index 0000000..a116155 --- /dev/null +++ b/docs/source/feature_auto_complete.rst @@ -0,0 +1,124 @@ +Auto Complete / Intellisense +============================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`VS Code Settings` + * - **Databricks Docs:** + - NA + +Overview +-------- +The framework uses JSON Schema-based IntelliSense when creating or editing Data Flow specifications, which includes: + +- Auto-completion (suggesting keys/values) +- Validation (checking for errors based on the schema) +- Quick Fixes (where applicable) + +Autocompletion Example +~~~~~~~~~~~~~~~~~~~~~~ +*Suggesting keys* + +.. image:: images/screenshot_intellisense_keys.png + :alt: Intellisense Keys + +*Suggesting values* + +.. image:: images/screenshot_intellisense_values.png + :alt: Intellisense Values + +Validation Example +~~~~~~~~~~~~~~~~~~ + +*Schema Validation* + +.. image:: images/screenshot_validation_keys.png + :alt: Validation Keys + +*Value Validation* + +.. 
image:: images/screenshot_validation_values.png + :alt: Validation Values + +Configuration for VS Code +------------------------- +To enable Auto Complete / Intellisense in VS Code, you need to add the JSON schemas to your workspace or user ``settings.json`` file. +Detailed instructions are available in the `VS Code - JSON schema and Settings `_ documentation. + +To open the ``settings.json`` file, you can use the command palette (Ctrl+Shift+P or Cmd+Shift+P on Mac) and search for "JSON Schema". + +.. image:: images/screenshot_vscode_json_settings.png + :alt: Open Settings (JSON) + +Add the following code into your ``settings.json`` replacing ```` with the path to the Lakeflow Framework project on your local machine unless you are adding this object directly to the ``settings.json`` in your ``.vscode`` directory in the Lakeflow Framework project in which case you should remove ````: + +.. code-block:: json + + { + "json.schemas": [ + { + "fileMatch": [ + "*_flow.json" + ], + "url": "/src/schemas/flow_group.json" + }, + { + "fileMatch": [ + "*_main.json" + ], + "url": "/src/schemas/main.json" + }, + { + "fileMatch": [ + "*_dqe.json" + ], + "url": "/src/schemas/expectations.json" + }, + { + "fileMatch": [ + "*_secrets.json" + ], + "url": "/src/schemas/secrets.json" + } + ] + } + +Example ``settings.json`` file: + +.. 
code-block:: json + + { + "python.analysis.extraPaths": [ + "src" + ], + "json.schemas": [ + { + "fileMatch": [ + "*flow.json" + ], + "url": "/Users/erik.seefeld/Documents/dev_work/dlt_framework/src/schemas/flow_group.json" + }, + { + "fileMatch": [ + "*main.json" + ], + "url": "/Users/erik.seefeld/Documents/dev_work/dlt_framework/src/schemas/main.json" + }, + { + "fileMatch": [ + "*_dqe.json" + ], + "url": "/Users/erik.seefeld/Documents/dev_work/dlt_framework/src/schemas/expectations.json" + }, + { + "fileMatch": [ + "*_secrets.json" + ], + "url": "/Users/erik.seefeld/Documents/dev_work/dlt_framework/src/schemas/secrets.json" + } + ] + } diff --git a/docs/source/feature_builder_parallelization.rst b/docs/source/feature_builder_parallelization.rst new file mode 100644 index 0000000..a63b639 --- /dev/null +++ b/docs/source/feature_builder_parallelization.rst @@ -0,0 +1,106 @@ +Builder Parallelization +======================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Global` :bdg-success:`Pipeline Configuration` + * - **Databricks Docs:** + - NA + +Overview +-------- +The Lakeflow Framework supports parallel processing during both dataflow specification building and pipeline initialization phases to improve performance and reduce initialization time. +This feature utilizes ThreadPoolExecutor to process multiple operations concurrently, which is particularly beneficial for: + +- Large pipelines with many dataflow specifications +- Complex dataflow specifications requiring validation and transformation + +The framework automatically detects the number of logical CPU cores available on the Spark driver using ``os.cpu_count()`` and sets the default max workers to ``cores - 1`` to reserve one core for system operations. This ensures optimal performance while maintaining system stability. 
If CPU core detection fails, the framework falls back to a default of 1 worker thread. + +Parameters +---------- + +.. list-table:: + :header-rows: 1 + :widths: auto + + * - **Parameter** + - **Type** + - **Default Value** + - **Phase** + - **Description** + * - ``override_max_workers`` + - Integer + - NA + - Pipeline Initialization + - Should only be used if the auto-detected default is not working. Controls the maximum number of worker threads used when: + + - Reading dataflow specification files from the filesystem + - Validating dataflow specifications against schemas + - Applying dataflow specification version mappings and transformations + - Creating DataFlow objects from dataflow specifications + - Initializing SDP tables, views, and streaming tables + * - ``pipeline_builder_disable_threading`` + - Boolean + - False + - Pipeline Initialization + - Disables threading when creating DataFlow objects + +Configuration +------------- + +Global Configuration +~~~~~~~~~~~~~~~~~~~~~ +Configure these parameters globally for all pipelines in your ``src/config/global.json|yaml`` file: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 7 + + { + "spark_config": { + "spark.databricks.sql.streamingTable.cdf.applyChanges.returnPhysicalCdf": true, + "pipelines.streamingFlowReadOptionsEnabled": true, + "pipelines.externalSink.enabled": true + }, + "override_max_workers": 4, + "mandatory_table_properties": { + "delta.logRetentionDuration": "interval 45 days", + "delta.deletedFileRetentionDuration": "interval 45 days", + "delta.enableRowTracking": "true" + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + :emphasize-lines: 5 + + spark_config: + spark.databricks.sql.streamingTable.cdf.applyChanges.returnPhysicalCdf: true + pipelines.streamingFlowReadOptionsEnabled: true + pipelines.externalSink.enabled: true + override_max_workers: 4 + mandatory_table_properties: + delta.logRetentionDuration: interval 45 days + delta.deletedFileRetentionDuration: interval 45 days + delta.enableRowTracking: 'true' + +Troubleshooting +--------------- + +**Debugging Core Detection:** + +The framework logs the detected core count and calculated default max workers during initialization: + +.. code-block:: text + + INFO - Logical cores (threads): 4 + INFO - Default max workers: 3 diff --git a/docs/source/feature_cdc.rst b/docs/source/feature_cdc.rst new file mode 100644 index 0000000..b564eb1 --- /dev/null +++ b/docs/source/feature_cdc.rst @@ -0,0 +1,47 @@ +Change Data Capture (CDC) +========================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/cdc.html + +Lakeflow Declarative Pipelines simplifies change data capture (CDC) with the AUTO CDC and AUTO CDC FROM SNAPSHOT APIs. + +* Use AUTO CDC to process changes from a change data feed (CDF). +* Use AUTO CDC FROM SNAPSHOT to process changes in database snapshots. + +Both AUTO CDC and AUTO CDC FROM SNAPSHOT support: + +* Updating tables using SCD type 1 and type 2 + * Use SCD type 1 to update records directly. History is not retained for updated records. + * Use SCD type 2 to retain a history of records, either on all updates or on updates to a specified set of columns. +* Out of order records +* Retaining a history of records, either on all updates or on updates to a specified set of columns. 
+ +Use of AUTO CDC FROM SNAPSHOT +----------------------------- +There are two ways to use the AUTO CDC FROM SNAPSHOT feature: + +1. To process changes from a snapshot of a table/view periodically (**periodic**) + + - This can be used in both ``standard`` and ``flow`` data flow types (refer to :ref:`dataflow types` for more information). + - For this to be enabled, the ``cdcSnapshotSettings`` object must be configured with the ``snapshotType`` set to ``periodic`` and must be chained so that a source is available to ingest the snapshots. + + A new snapshot is ingested with each pipeline update, and the ingestion time is used as the snapshot version. When a pipeline is run in continuous mode, multiple snapshots are ingested with each pipeline update on a period determined by the trigger interval setting for the flow that contains the AUTO CDC FROM SNAPSHOT processing + +2. To process historical snapshot from a file or table based source which has multiple snapshots available at any given time (**historical**) + + - This can only be used in ``standard`` data flow types. + - For this to be enabled, the ``cdcSnapshotSettings`` object must be configured with the ``snapshotType`` set to ``historical`` and must not have a source configured at the root level of the Data Flow Spec, the source is instead configured at the ``cdcSnapshotSettings`` level (refer to :ref:`dataflow_spec_ref_cdc` for more information). + - This will process all the historical snapshots available at the time of the pipeline run and any new snapshots will be ingested with each pipeline update. + +Configuration +------------- + +Set as an attribute when creating your Data Flow Spec, refer to the :doc:`dataflow_spec_ref_cdc` section of the :doc:`dataflow_spec_reference` documentation for more information. 
diff --git a/docs/source/feature_cdf.rst b/docs/source/feature_cdf.rst new file mode 100644 index 0000000..e9845b1 --- /dev/null +++ b/docs/source/feature_cdf.rst @@ -0,0 +1,137 @@ +Change Data Feed (CDF) +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta/delta-change-data-feed + + +Overview +-------- +Change Data Feed (CDF) is a Delta Lake feature that enables tracking of row-level changes between versions of a Delta table. +The framework provides built-in support for CDF to help track and process data changes efficiently. + +Configuration +------------- + +Enabling CDF on a Table +~~~~~~~~~~~~~~~~~~~~~~~ + +To enable CDF on a target table or staging table, you need to add the ``delta.enableChangeDataFeed`` property to the ``tableProperties`` object of the ``targetDetails`` object in your Data Flow Spec and set it to ``true``. For example: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 6 + + { + "targetFormat": "delta", + "targetDetails": { + "table": "my_table", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_schema.json" + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 5 + + targetFormat: delta + targetDetails: + table: my_table + tableProperties: + delta.enableChangeDataFeed: 'true' + schemaPath: customer_schema.json + +Reading From CDF in a View +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To read from CDF, you need to do so via a view. When specifying a view in your Data Flow Spec, set the ``cdfEnabled`` attribute to ``true``. There are different types of dataflow specs and ways to specify a view, refer to the :doc:`dataflow_spec_reference` documentation for more information. + +Standard Dataflow Spec example: + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + :emphasize-lines: 6 + + { + "sourceViewName": "v_customer_address", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 5 + + sourceViewName: v_customer_address + sourceDetails: + database: '{bronze_schema}' + table: customer_address + cdfEnabled: true + +Flows Dataflow Spec example: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 9 + + { + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 8 + + views: + v_customer: + mode: stream + sourceType: delta + sourceDetails: + database: '{bronze_schema}' + table: customer + cdfEnabled: true + +Important Considerations: +------------------------- + +Refer to the Databricks `documentation `_ for information on: + +* Concepts +* Schema / CDF columns +* Change types +* Limitations diff --git a/docs/source/feature_data_quality_expectations.rst b/docs/source/feature_data_quality_expectations.rst new file mode 100644 index 0000000..07f447c --- /dev/null +++ b/docs/source/feature_data_quality_expectations.rst @@ -0,0 +1,115 @@ +Data Quality - Expectations +============ + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/expectations.html + +Define expectations in your Data Flow Spec to apply quality constraints that validate data as it flows through ETL pipelines. 
+ +Configuration +------------- + +Defining Expectations +~~~~~~~~~~~~~~~~~~~~~ +In a Pipeline Bundle bundle, expectations are defined at a table level and must be located in an ``expectations`` sub-folder of the directory containing the corresponding Data Flow Spec. Examples: + +* flat structure: ``src/dataflows/expectations/_dqe.json`` +* organized by target table: ``src/dataflows//expectations/_dqe.json`` + +.. note:: + + The expectations file name can have any name (with a .json extension), but best practice is to use the pattern ``_dqe.json``. + +The schema for the expectations file is defined below, in the :ref:`expectations-schema` section. + +Enabling and Referencing Expectations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Once the expectations are defined, they can be enabled and referenced in the Data Flow Spec, per the following sections of this documentation: + +* :ref:`Flows Data Flow - Data Quality Configuration ` +* :ref:`Standard Data Flow - Data Quality Configuration ` + + +.. _expectations-schema: + +Expectations Schema +-------------------- + +.. code-block:: json + + { + "": [ + { + "name": "", + "contraint": "SQL constraint", + "tag": "", + "enabled": "" + } + ] + } + +.. list-table:: + :header-rows: 1 + + * - Field + - Type + - Description + * - **type of expectation** + - ``string`` + - The type of expectation. Valid types are: + + 1. **expect**: Use the expect operator when you want to keep records that violate the expectation. Records that violate the expectation are added to the target dataset along with valid records + 2. **expect_or_drop** : Use the expect_or_drop operator when you want to drop records that violate the expectation. Records that violate the expectation are dropped from the target dataset + 3. **expect_or_fail**: Use the expect_or_fail operator when you want to fail the dataflow if any records violate the expectation. 
The dataflow will fail and stop execution if any records violate the expectation + + * - **name** + - ``string`` + - The name of the expectation, any unique name can be given. + * - **constraint** + - ``string`` + - The SQL constraint that defines the expectation. The constraint should be a valid SQL expression that evaluates to a boolean value. If the constraint evaluates to ``true``, the record is considered valid; otherwise, it is considered invalid. + * - **tag** (*optional*) + - ``string`` + - The tag is used to group expectations together. + * - **enabled** (*optional*) + - ``boolean`` + - Specifies if the expectation is enabled or not. If the expectation is enabled, it will be validated; otherwise, it will be ignored. If not specified, the expectation will be enabled by default. + +Examples +-------- + +.. code-block:: json + + { + "expect": [ + { + "name": "expectation_1", + "constraint": "column_1 > 0", + "tag": "tag_1", + "enabled": true + } + ], + "expect_or_drop": [ + { + "name": "expectation_2", + "constraint": "column_2 < 100", + "tag": "tag_2", + "enabled": true + } + ], + "expect_or_fail": [ + { + "name": "expectation_3", + "constraint": "column_3 IS NOT NULL", + "tag": "tag_3", + "enabled": true + } + ] + } diff --git a/docs/source/feature_data_quality_quarantine.rst b/docs/source/feature_data_quality_quarantine.rst new file mode 100644 index 0000000..5437d06 --- /dev/null +++ b/docs/source/feature_data_quality_quarantine.rst @@ -0,0 +1,28 @@ +Data Quality - Quarantine +========================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - NA + +The Lakeflow Framework provides a quarantine feature that allows you to quarantine records that violate defined pipeline expectations. + +There are multiple ways to handle quarantined records and these can be configured using the ``quarantineMode`` property in the Data Flow Spec.
Available options are: + +- **off**: The quarantine feature is disabled +- **flag**: The quarantined records are flagged in the target table +- **table**: The quarantined records are stored in a separate quarantine table + +If the ``quarantineMode`` property is set to ``table``, the ``quarantineTargetDetails`` property can be set in the Data Flow Spec to define the details of the quarantine table; otherwise the quarantine table will be derived based on the main target table. + + +Configuration +------------- + +Set as an attribute when creating your Data Flow Spec, refer to the :doc:`dataflow_spec_ref_data_quality` documentation for more information. \ No newline at end of file diff --git a/docs/source/feature_direct_publishing_mode.rst b/docs/source/feature_direct_publishing_mode.rst new file mode 100644 index 0000000..54c32f8 --- /dev/null +++ b/docs/source/feature_direct_publishing_mode.rst @@ -0,0 +1,36 @@ +Direct Publishing Mode +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Pipeline` :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - - https://docs.databricks.com/aws/en/dlt/target-schema + - https://docs.databricks.com/aws/en/dlt/configure-pipeline + +.. Note:: + Direct publishing mode should be enabled by default for all new pipelines by specifying the ``schema`` field in the pipeline resource file. Sample pipelines have been updated to have direct publishing mode enabled. To enable direct publishing mode on existing pipelines, the pipeline needs to be destroyed and redeployed with the ``target`` field updated to ``schema`` in the pipeline resource file. + +.. Warning:: + Destroying and redeploying a pipeline to enable direct publishing mode will result in the pipeline tables being dropped and recreated and therefore, will reprocess all the data on the next run of the pipeline.
+ + +The Framework supports: + +* `Publishing to multiple catalogs and schemas from a single pipeline `_ +* `Legacy Live Schema Publishing `_ + + +This is configured in two places: + +1. In the Data Flow Spec, under the ``targetDetails`` section. +2. In the pipeline resource file. Databricks Spark Declarative Pipeline Settings documentation: https://docs.databricks.com/aws/en/dlt/configure-pipeline + + +Refer to the section :doc:`build_pipeline_bundle_steps` for more information. + + diff --git a/docs/source/feature_liquid_clustering.rst b/docs/source/feature_liquid_clustering.rst new file mode 100644 index 0000000..2f02cd4 --- /dev/null +++ b/docs/source/feature_liquid_clustering.rst @@ -0,0 +1,40 @@ +Liquid Clustering +================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta/clustering.html + +Liquid clustering is a Databricks feature that replaces traditional table partitioning and ``ZORDER`` to simplify data layout decisions and optimize query performance. It provides flexibility to redefine clustering keys without rewriting existing data, allowing data layout to evolve alongside analytic needs over time. + +Use Cases +--------- + +Databricks recommends liquid clustering for all new Delta tables, which includes both Streaming Tables (STs) and Materialized Views (MVs). The following are examples of scenarios that benefit from clustering: + +* Tables often filtered by high cardinality columns. +* Tables with significant skew in data distribution. +* Tables that grow quickly and require maintenance and tuning effort. +* Tables with concurrent write requirements. +* Tables with access patterns that change over time. +* Tables where a typical partition key could leave the table with too many or too few partitions. 
+ +Selecting Clustering Keys +------------------------- + +If ``clusterByAuto`` is set to ``true``, the clustering keys will be automatically selected based on the data in the table. Otherwise, the clustering keys can be specified in the ``clusterByColumns`` attribute. +If both ``clusterByAuto`` and ``clusterByColumns`` are set, the columns specified in ``clusterByColumns`` will be used as the initial clustering keys and the keys will be automatically updated based on the data in the table over time. + +Please refer to the `Liquid Clustering documentation `_ for more information on selecting clustering keys. + +Configuration +------------- + +Enabled by setting the ``clusterBy`` attribute as documented in :ref:`dataflow_spec_ref_target_details_delta`. +Additionally or alternatively, the ``clusterByAuto`` attribute can be enabled to allow for automatic clustering key selection as documented in :ref:`dataflow_spec_ref_target_details_delta`. diff --git a/docs/source/feature_logging.rst b/docs/source/feature_logging.rst new file mode 100644 index 0000000..eeda0bb --- /dev/null +++ b/docs/source/feature_logging.rst @@ -0,0 +1,102 @@ +Logging +======= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Pipeline` + * - **Databricks Docs:** + - NA + +The Lakeflow Framework provides structured logging capabilities to help track pipeline execution and troubleshoot issues. Logging is implemented using Python's standard ``logging`` module with custom configuration. 
+ +Log Levels +---------- + +The framework supports standard Python logging levels: + +- DEBUG: Detailed information for debugging +- INFO: General information about pipeline execution +- WARNING: Warning messages for potential issues +- ERROR: Error messages for failed operations +- CRITICAL: Critical errors that may cause pipeline failure + +Configuration +------------- + +The default log level for all pipelines is ``INFO``. +To specify a different log level, you can set the ``logLevel`` parameter in the `Configuration` section of a Spark Declarative Pipeline. +You can do this in one of the two ways described below. + +Setting the Log Level in the Pipeline Yaml +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The log level can be configured when creating a pipeline yaml in the resources folder of a pipeline bundle. +This is done by adding the ``logLevel`` parameter in the configuration section of the pipeline.yaml, per the below screenshot. + +.. image:: images/screenshot_pipeline_log_level_yaml.png + +Setting the Log Level in the Databricks UI +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The log level can also be manually set at any time in the Databricks UI. +To do so, browse to your desired Pipeline, open the Pipeline settings and add the ``logLevel`` in the `Advanced Configuration` section as shown below: + +.. image:: images/screenshot_pipeline_log_level_ui.png + +Permissions to View Logs +^^^^^^^^^^^^^^^^^^^^^^^^ + +By default, only the pipeline owner has permissions to view the logs for a given pipeline execution. + +To grant other users access to the logs, you must add the below Spark configuration to the Framework using the :doc:`feature_spark_configuration` feature of the Framework. + +.. 
code-block:: text + + "spark.databricks.acl.needAdminPermissionToViewLogs": "false" + +This is documented in the Databricks documentation here: https://docs.databricks.com/en/compute/clusters-manage.html + +Viewing the Logs +---------------- + +The logs can be viewed in the Databricks UI by: + +1. Browsing to the desired Pipeline. +2. Selecting the desired Update ID (pipeline execution). +3. Selecting the `Update` tab on the right-hand side of the UI and then clicking on the `Logs` link at the bottom of the tab. + + .. image:: images/screenshot_logs_viewing_1.png + +4. A new browser tab will open displaying the log in the STDOUT section as shown below: + + .. image:: images/screenshot_logs_viewing_2.png + +Example Log Messages +-------------------- + +The framework logs various types of information: + +Pipeline Initialization: + +.. code-block:: text + + 2025-02-06 04:05:46,161 - DltFramework - INFO - Initializing Pipeline... + 2025-02-06 04:05:46,772 - DltFramework - INFO - Retrieving Global Framework Config From: {path} + 2025-02-06 04:05:46,908 - DltFramework - INFO - Retrieving Pipeline Configs From: {path} + +Flow Creation: + +.. code-block:: text + + 2025-02-06 04:05:48,254 - DltFramework - Creating Flow: flow_name + 2025-02-06 04:05:48,254 - DltFramework - Creating View: view_name, mode: stream, source type: delta + +Error Handling: + +.. code-block:: text + + 2025-02-06 04:06:26,527 - DltFramework - ERROR - Failed to process Data Flow Spec: {error_details} diff --git a/docs/source/feature_logical_environment.rst b/docs/source/feature_logical_environment.rst new file mode 100644 index 0000000..0911191 --- /dev/null +++ b/docs/source/feature_logical_environment.rst @@ -0,0 +1,93 @@ +Logical Environments +==================== + +.. 
list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Bundle` + * - **Databricks Docs:** + - NA + +The logical environment feature allows you to specify additional naming separation for Pipeline and Unity Catalog resources. This allows for fine grain separation of resources when working with larger teams in development and SIT environments. + +The logical environment is appended as a suffix to the Pipeline name and the Unity Catalog resource names at Bundle deployment time. + +Configuration +------------- + +The logical environment is configured in two places. + +1. **databricks.yml** Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Defined in the variables section: + +.. code-block:: yaml + + variables: + logical_env: + description: The logical environment + default: "" + +The value is passed / retrieved at bundle deployment time. The section :ref:`feature_logical_env_passing` below, describes the different ways to pass the logical environment value. + +2. **pipeline resource YAML files** Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To leverage the pipeline resource YAML when deploying a Pipeline, the logical environment must be specified in a pipelines resource YAML file(s) in the ``logicalEnv`` section. For example: + +.. code-block:: yaml + :emphasize-lines: 4,17 + + resources: + pipelines: + dlt_framework_samples_bronze_pipeline: + name: dlt_framework_samples_bronze_pipeline${var.logical_env} + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: /Workspace/${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + logicalEnv: ${var.logical_env} + workspace.host: ${var.workspace_host} + + + +.. 
_feature_logical_env_passing: + +Passing the logical environment +-------------------------------- + +The logical environment can be passed in one of two ways at bundle deployment time. These also apply to any CI/CD pipeline that is used to deploy the bundle. + +1. Environment Variable +~~~~~~~~~~~~~~~~~~~~~~~ + +An environment variable can be set prior to executing the ``databricks bundle deploy`` command. For example: + +.. code-block:: bash + + export BUNDLE_VAR_logical_env=my_logical_env_suffix + +Databricks reference: https://docs.databricks.com/en/dev-tools/bundles/variables.html#set-a-variables-value + +2. Command Line Argument +~~~~~~~~~~~~~~~~~~~~~~~~ + +The logical environment can be directly specified via the ``databricks bundle deploy`` command. For example: + +.. code-block:: bash + + databricks bundle deploy --var="logical_env=my_logical_env_suffix" + +Databricks reference: https://docs.databricks.com/en/dev-tools/bundles/variables.html#set-a-variables-value diff --git a/docs/source/feature_mandatory_table_properties.rst b/docs/source/feature_mandatory_table_properties.rst new file mode 100644 index 0000000..5495962 --- /dev/null +++ b/docs/source/feature_mandatory_table_properties.rst @@ -0,0 +1,120 @@ +Mandatory Table Properties +========================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Framework Bundle` + * - **Configuration Scope:** + - :bdg-info:`Global` + * - **Databricks Docs:** + - https://docs.databricks.com/aws/en/delta/table-properties + +The Mandatory Table Properties feature allows you to define a set of table properties that will be automatically applied to all tables created by the Framework. This ensures consistent table configurations across your data lakehouse. + +Configuration +------------- + +| **Scope: Global** +| Mandatory table properties are defined in the global configuration file located at ``src/config/global.json|yaml`` under the ``mandatory_table_properties`` section. 
+ +Configuration Schema +-------------------- + +The mandatory table properties configuration is defined as key-value pairs as follows: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "mandatory_table_properties": { + "<property_name>": "<property_value>", + ... + } + } + + .. tab:: YAML + + .. code-block:: yaml + + mandatory_table_properties: + <property_name>: <property_value> + ... + +Common Properties +----------------- + +Some commonly used table properties include: + +.. list-table:: + :header-rows: 1 + + * - Property + - Description + - Example Value + * - **delta.autoOptimize.optimizeWrite** + - Enables write optimization for the table + - ``true`` + * - **delta.autoOptimize.autoCompact** + - Enables automatic file compaction + - ``true`` + * - **delta.enableChangeDataFeed** + - Enables Change Data Feed for the table + - ``true`` + * - **delta.columnMapping.mode** + - Specifies the column mapping mode + - ``"name"`` + * - **comment** + - Adds a description to the table + - ``"This table contains..."`` + +Example Configuration +--------------------- + +Here's an example configuration that sets some common table properties: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "mandatory_table_properties": { + "delta.autoOptimize.optimizeWrite": "true", + "delta.autoOptimize.autoCompact": "true", + "delta.enableChangeDataFeed": "true", + "delta.columnMapping.mode": "name" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + mandatory_table_properties: + delta.autoOptimize.optimizeWrite: 'true' + delta.autoOptimize.autoCompact: 'true' + delta.enableChangeDataFeed: 'true' + delta.columnMapping.mode: name + +.. admonition:: Note + :class: note + + - All property values must be specified as strings, even for boolean values + - Properties defined here will be applied to all tables created by the Framework + - These properties cannot be overridden at the individual table level + +.. 
admonition:: Best Practice + :class: note + + It's recommended to: + + - Enable auto-optimize features for better performance + - Enable Change Data Feed if you need to track changes + - Use column mapping to ensure schema evolution compatibility + - Add meaningful table comments for documentation \ No newline at end of file diff --git a/docs/source/feature_materialized_views.rst b/docs/source/feature_materialized_views.rst new file mode 100644 index 0000000..1613c86 --- /dev/null +++ b/docs/source/feature_materialized_views.rst @@ -0,0 +1,293 @@ +Materialized Views +================ + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Data Flow Spec` + * - **Databricks Docs:** + - - `Materialized Views `_ + - `Delta Live Tables Python Reference `_ + - `Delta Live Tables SQL Reference `_ + +Overview +-------- +Materialized Views are the precomputed results of a query stored in a Table. Please refer to the above documentation for full details on Materialized Views and how they work. + +Key Features: + +- Automatic updates based on pipeline schedule/triggers +- Guaranteed consistency with source data. All required data is processed, even if it arrives late or out of order. +- Incremental refresh optimization. Databricks will try to choose the appropriate strategy that minimizes the cost of updating a materialized view. +- Ideal for transformations and aggregations +- Pre-computation of slow queries +- Optimization for frequently used computations + +.. admonition:: Important + :class: warning + + To support Incremental refresh, some keywords and clauses require row-tracking to be enabled on the queried data sources. 
+ Refer to the following links for details on: + - `Incremental Refresh `_ + - `Row Tracking `_ + +Sample Bundle +------------- + +A sample is available in: + + - the ``bronze_sample`` bundle in the ``src/dataflows/feature_samples`` folder in the ``materialized_views_main.json|yaml`` file + - the ``gold_sample`` bundle in the ``src/dataflows/base_samples`` folder in the ``materialized_views_main.json|yaml`` file + + +Data Flow Spec Configuration +---------------------------- + +Materialized Views must be configured in the Materialized Views Data Flow Spec Type. This Data Flow Specification is defined in the :doc:`dataflow_spec_ref_main_materialized_views` documentation. + +Data Flow Spec Configuration Schema +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following schema details the configuration for a Materialized View Data Flow Spec: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "feature_materialized_views", + "dataFlowGroup": "feature_samples", + "dataFlowType": "materialized_view", + "materializedViews": { + "mv_name": { + "sourceView": { + "sourceViewName": "", + "sourceType": "[delta|python|sql]", + "sourceDetails": {} + }, + "sqlPath": "", + "sqlStatement": "", + "tableDetails": { + "database": "", + "schemaPath": "", + "tableProperties": {}, + "path": "", + "partitionColumns": [], + "clusterByColumns": [] + }, + "dataQualityExpectationsEnabled": false, + "dataQualityExpectationsPath": "", + "quarantineMode": "off", + "quarantineTargetDetails": {} + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: feature_materialized_views + dataFlowGroup: feature_samples + dataFlowType: materialized_view + materializedViews: + mv_name: + sourceView: + sourceViewName: '' + sourceType: '[delta|python|sql]' + sourceDetails: {} + sqlPath: '' + sqlStatement: '' + tableDetails: + database: '' + schemaPath: '' + tableProperties: {} + path: '' + partitionColumns: [] + clusterByColumns: [] + dataQualityExpectationsEnabled: false + dataQualityExpectationsPath: '' + quarantineMode: 'off' + quarantineTargetDetails: {} + +Source Type Details +~~~~~~~~~~~~~~~~~~~ + +Materialized Views can be configured in your Data Flow Spec in three ways: + +1. **Using a Source View** + - Define a source view that the materialized view will be based on + - Supports Delta, Python, and SQL source types + - Example configuration: + + .. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "sourceView": { + "sourceViewName": "v_customer", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + + sourceView: + sourceViewName: v_customer + sourceType: delta + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true + +2. **Using SQL Path** + - Reference a SQL file containing the query for the materialized view + - Example configuration: + + .. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "sqlPath": "./customer_mv.sql" + } + + .. tab:: YAML + + .. code-block:: yaml + + sqlPath: ./customer_mv.sql + +3. **Using SQL Statement** + - Directly specify the SQL query for the materialized view + - Example configuration: + + .. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "sqlStatement": "SELECT * FROM {staging_schema}.customer" + } + + .. tab:: YAML + + .. 
code-block:: yaml + + sqlStatement: SELECT * FROM {staging_schema}.customer + +Additional Configuration Options +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Materialized Views support several additional configuration options: + +- **Table Details**: Configure the target table properties + - Database + - Schema path + - Table properties + - Path + - Partition columns + - Cluster by columns + +- **Data Quality Expectations** + - Enable data quality checks + - Specify expectations path + - Configure quarantine mode (off, flag, table) + +- **Quarantine Configuration** + - Set quarantine mode + - Configure quarantine target details + +Example Configuration +------------------- + +A complete example of a materialized view configuration: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "feature_materialized_views", + "dataFlowGroup": "feature_samples", + "dataFlowType": "materialized_view", + "materializedViews": { + "mv_from_source_view": { + "sourceView": { + "sourceViewName": "v_mv_source_view", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + } + }, + "mv_from_sql_path": { + "sqlPath": "./mv_from_sql_path.sql" + }, + "mv_from_sql_statement": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer" + }, + "mv_with_quarantine": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer_address", + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta" + } + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: feature_materialized_views + dataFlowGroup: feature_samples + dataFlowType: materialized_view + materializedViews: + mv_from_source_view: + sourceView: + sourceViewName: v_mv_source_view + sourceType: delta + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true + mv_from_sql_path: + sqlPath: ./mv_from_sql_path.sql + mv_from_sql_statement: + sqlStatement: SELECT * FROM {staging_schema}.customer + mv_with_quarantine: + sqlStatement: SELECT * FROM {staging_schema}.customer_address + dataQualityExpectationsEnabled: true + dataQualityExpectationsPath: ./customer_address_dqe.json + quarantineMode: table + quarantineTargetDetails: + targetFormat: delta + +For more detailed information about configuration options, refer to the :doc:`dataflow_spec_reference` documentation. \ No newline at end of file diff --git a/docs/source/feature_multi_source_streaming.rst b/docs/source/feature_multi_source_streaming.rst new file mode 100644 index 0000000..5097e13 --- /dev/null +++ b/docs/source/feature_multi_source_streaming.rst @@ -0,0 +1,48 @@ +Multi-Source Streaming +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/flows.html + +Delta Live Tables supports processing that requires reading data from multiple streaming sources to update a single streaming table via: + +* **Append Flows** - Append streams from multiple sources to a single streaming table. +* **Change Flows** - Process CDC events from multiple sources to a single streaming table, using the CDC APIs. + +The Lakeflow Framework implements this capability via the Data Flow Spec using the concept of flow groups and flows. 
+ +Configuration +------------- + +In a Pipeline Bundle, multi-source streaming is configured in the Data Flow Spec using the ``flow_groups`` and ``flows`` attributes. +This is documented in :ref:`flow-group-configuration` and :ref:`flow-configuration`. + +Key Features +------------ +- Write to a single streaming table from multiple source streams +- Add or remove streaming sources without requiring a full table refresh +- Support for backfilling historical data +- Alternative to UNION operations for combining multiple sources +- Maintain separate checkpoints for each flow + +Important Considerations +------------------------ +- Flow names are used to identify streaming checkpoints +- Renaming an existing flow creates a new checkpoint +- Flow names must be unique within a pipeline +- Data quality expectations should be defined on the target table, not in flow definitions +- Append flows provide more efficient processing compared to UNION operations for combining multiple sources +- Append SQL flows do not support quarantine table mode (they do support quarantine flag mode). This is because quarantine table mode requires a source view. + +See Also +-------- +- :doc:`feature_source_target_types` +- :doc:`dataflow_spec_ref_source_details` +- :doc:`dataflow_spec_ref_target_details` \ No newline at end of file diff --git a/docs/source/feature_operational_metadata.rst b/docs/source/feature_operational_metadata.rst new file mode 100644 index 0000000..97b0f4f --- /dev/null +++ b/docs/source/feature_operational_metadata.rst @@ -0,0 +1,193 @@ +Operational Metadata +==================== + +.. 
list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Framework Bundle` + * - **Configuration Scope:** + - :bdg-info:`Global` + * - **Databricks Docs:** + - NA + +Operational metadata is data that describes a given data load and pipeline execution, for example: + +* Ingestion date +* Execution date +* Update ID / Job ID +* File name (for data ingested from files) + +The Operational Metadata feature of the Framework allows for the addition of metadata columns to all target tables generated by the pipelines you have defined in the framework. + +.. admonition:: Best Practice + :class: note + + You can define and add as many metadata columns as you require. However, it is recommended that you group your data into logically organized ``struct`` or ``map`` columns to avoid cluttering the table schema. + +Configuration +------------- + +| **Scope: Global** +| In the Framework bundle, operational metadata columns are defined in JSON configuration files at Lakehouse layer level (e.g. bronze, silver, gold). The configuration files are located at and must be named as follows: ``src/config/operational_metadata_<layer>.json`` + +.. admonition:: Layer Config + :class: note + + * The layer suffix of the file needs to match one of the layers as defined in your pipeline / Data Flow Spec configurations e.g. bronze, silver, gold. + * The operational columns defined in the JSON configuration will be applied to all tables in the corresponding layer. + +Configuration Schema +-------------------- + +The operational metadata configuration file must follow the schema below: + +.. code-block:: json + + { + "type": "struct", + "fields": [ + { + "name": "", + "type": { + "type": "data_type", + "nullable": true/false, + "metadata": { + "mapping": { + "type": "", + "payload": "" + } + }, + } + }, + ... + ] + } + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + * - **name** + - The name of the operational metadata column. 
+ * - **type** + - The data_type of the column. 
For a list of valid data types please refer to the Databricks documentation: https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html + * - **nullable** + - A boolean that indicates whether the field can be null or not. + * - **metadata** + - A JSON object that contains the metadata for the column. + * - **metadata.mapping** + - A JSON object that contains the mapping configuration for the column. + * - **metadata.mapping.type** + - The type of mapping to be applied to the column. See below for supported types. + * - **metadata.mapping.payload** + - The payload for the mapping: a SQL string for the ``sql`` type, or an attribute name for the ``pipeline_detail`` type. + +**Mapping Types** + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + - Payload + * - **sql** + - The column values will be derived by executing the SQL string provided in the payload. + - Any valid SQL function or expression, per the Databricks `SQL Language Reference `_. + * - **pipeline_detail** + - The name of any single ``pipeline_detail`` attribute. + - The following attributes are available: + + * **pipeline_id**: The pipeline id + * **pipeline_update_id**: The pipeline update id + * **pipeline_layer**: The medallion layer for the pipeline + * **start_utc_timestamp**: The start timestamp of the pipeline in UTC + * **pipeline_catalog**: The SDP target catalog + * **pipeline_schema**: The SDP target schema/database + * **workspace_env**: The workspace environment + * **logical_env**: The logical environment + +.. Note:: + For existing tables/pipelines, the operational metadata schema can be added or modified only if the table schema is not enforced and schema evolution is allowed (in this case, only new records will have the correct values in the added/modified fields). Otherwise, the pipeline has to be fully refreshed to reprocess all data and apply operational metadata. 
+ +Best Practice Configuration +--------------------------- + +It is recommended that you group your data into logically organized struct or map columns to avoid cluttering the table schema, as described by the configuration schema below. +The operational metadata configuration file must follow the schema below: + +.. code-block:: json + + { + "type": "struct", + "fields": [ + { + "name": "", + "type": { + "type": "struct", + "fields": [ + { + "name": "", + "type": { + "type": "data_type", + "nullable": true/false, + "metadata": { + "mapping": { + "type": "", + "payload": "" + } + }, + } + }, + ... + ] + }, + ... + }, + ... + ] + } + +Examples +-------- + +The below example illustrates the default configuration for a generic bronze and silver layer deployment. + +.. code-block:: json + + { + "type": "struct", + "fields": [ + { + "name": "meta_load_details", + "type": { + "type": "struct", + "fields": [ + { + "name": "record_insert_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "payload": "current_timestamp()" + } + } + }, + { + "name": "pipeline_update_id", + "type": "string", + "nullable": false, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "payload": "pipeline_update_id" + } + } + } + ] + } + } + ] + } \ No newline at end of file diff --git a/docs/source/feature_python_dependency_management.rst b/docs/source/feature_python_dependency_management.rst new file mode 100644 index 0000000..763edea --- /dev/null +++ b/docs/source/feature_python_dependency_management.rst @@ -0,0 +1,239 @@ +Python Dependency Management +============================ + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Framework` :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Pipeline` + +Overview +-------- + +The Lakeflow Framework provides flexible Python dependency management at two levels: + +1. 
**Framework Level**: Global dependencies required by the framework or custom extensions (``requirements.txt``) +2. **Pipeline Bundle Level**: Bundle-specific dependencies configured via Databricks Asset Bundles + +This separation allows the framework to maintain its core dependencies independently while enabling pipeline developers to add custom packages for their specific use cases. + +.. important:: + + Databricks recommends using the **pipeline environment settings** to manage Python dependencies. + +Framework Dependencies +---------------------- + +The framework includes a ``requirements.txt`` file at the root of the repository that defines global dependencies required for the framework to function. + +Location +^^^^^^^^ + +:: + + dlt_framework/ + ├── requirements.txt # Framework dependencies + ├── requirements-dev.txt # Development dependencies (testing, docs, etc.) + └── src/ + └── ... + +Framework requirements.txt +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: text + :caption: requirements.txt + + ## requirements.txt: dependencies for runtime. + ## Core dependencies + jsonschema + + ## Add any additional dependencies needed for custom functionality below here + +.. note:: + + The framework's core dependencies are intentionally minimal. Add any additional dependencies needed for custom functionality below the core dependencies, do not change the core dependencies. + +Pipeline Bundle Dependencies +---------------------------- + +For pipeline-specific Python dependencies, Databricks recommends using the **pipeline environment** configuration in your Databricks Asset Bundle. For detailed information, see the official Databricks documentation: + +- `Manage Python dependencies for pipelines `_ +- `Databricks Asset Bundles - Pipeline Environment `_ + +Configuring Pipeline Environment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add the ``environment`` section to your pipeline resource definition in your Databricks Asset Bundle: + +.. 
code-block:: yaml + :caption: resources/pipeline.yml + :emphasize-lines: 10-13 + + resources: + pipelines: + my_pipeline: + name: My Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + + environment: + dependencies: + - -r + ${workspace.file_path}/requirements.txt + + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + +Using a Requirements File +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The recommended approach is to reference a ``requirements.txt`` file in your pipeline bundle: + +**Step 1: Create a requirements.txt in your pipeline bundle** + +For example: +.. code-block:: text + :caption: my_pipeline_bundle/requirements.txt + requests>=2.28.0 + openpyxl + +**Step 2: Reference it in your pipeline environment** + +.. code-block:: yaml + + environment: + dependencies: + - -r + ${workspace.file_path}/requirements.txt + +.. important:: + + The ``-r`` flag tells pip to read requirements from a file. The path ``${workspace.file_path}`` is substituted with the deployed bundle location in the Databricks workspace. + +Inline Dependencies +^^^^^^^^^^^^^^^^^^^ + +For simple cases with few dependencies, you can specify packages inline: + +.. code-block:: yaml + + environment: + dependencies: + - requests>=2.28.0 + - pandas>=2.0.0 + +Installing from Unity Catalog Volumes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can also install Python wheel packages stored in Unity Catalog volumes: + +.. code-block:: yaml + + environment: + dependencies: + - /Volumes/my_catalog/my_schema/my_volume/my_package-1.0-py3-none-any.whl + +Best Practices +-------------- + +Version Pinning +^^^^^^^^^^^^^^^ + +Always pin dependency versions to ensure reproducible builds: + +.. 
code-block:: text + + # Recommended: Pin to minimum version + requests>=2.28.0 + + # For strict reproducibility + pandas==2.0.3 + + # Avoid: Unpinned versions + requests # Not recommended + +Documentation +^^^^^^^^^^^^^ + +Add comments to explain why each dependency is needed: + +.. code-block:: text + + # HTTP client for external API integrations + requests>=2.28.0 + + # JSON schema validation for custom specs + jsonschema>=4.0.0 + + # Date parsing utilities for transform functions + python-dateutil>=2.8.0 + +Testing Dependencies Locally +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Before deploying, test that dependencies install correctly: + +.. code-block:: bash + + # Create a virtual environment + python -m venv test_env + source test_env/bin/activate + + # Install dependencies + pip install -r requirements.txt + + # Verify imports work + python -c "import requests; import pandas; print('Success!')" + +Limitations +----------- + +1. **JVM Libraries Not Supported**: Lakeflow Declarative Pipelines only support SQL and Python. JVM libraries (Scala/Java) cannot be used and may cause unpredictable behavior. + +2. **Startup Time Impact**: Each additional dependency increases pipeline startup time. Keep dependencies minimal for faster pipeline starts. + +3. **No Hot Reloading**: Dependencies are installed at pipeline startup. Adding new dependencies requires a pipeline restart. + +4. **Cluster-Wide Scope**: Dependencies are installed for the entire pipeline cluster. Be mindful of potential conflicts between packages. + +Troubleshooting +--------------- + +Dependencies Not Found +^^^^^^^^^^^^^^^^^^^^^^ + +If packages aren't being installed: + +1. Verify the ``environment`` section is correctly indented in your YAML +2. Check that the path to ``requirements.txt`` is correct +3. Ensure the requirements file is included in your bundle deployment + +.. 
code-block:: yaml + + # Verify correct path substitution + environment: + dependencies: + - -r + ${workspace.file_path}/requirements.txt # Points to bundle root + +Version Conflicts +^^^^^^^^^^^^^^^^^ + +If you encounter version conflicts: + +1. Check for conflicting versions between framework and bundle requirements +2. Use ``pip check`` locally to identify conflicts +3. Consider pinning specific versions to resolve conflicts + +.. code-block:: bash + + pip install -r requirements.txt + pip check # Shows any dependency conflicts + + diff --git a/docs/source/feature_python_extensions.rst b/docs/source/feature_python_extensions.rst new file mode 100644 index 0000000..e3359c9 --- /dev/null +++ b/docs/source/feature_python_extensions.rst @@ -0,0 +1,379 @@ +Python Extensions +================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Pipeline` + +Overview +-------- +Python Extensions allow data engineers to write custom Python modules that extend the framework's capabilities. Extensions are organized in a central ``extensions/`` directory and can be imported as standard Python modules throughout your dataflow specifications. + +.. important:: + + Extensions provide a powerful mechanism for implementing custom logic—sources, transforms, and sinks—while maintaining clean separation between framework code and business logic. + +This feature allows development teams to: + +- **Centralize Custom Logic**: Organize all custom Python code in one location +- **Reuse Across Dataflows**: Reference the same functions from multiple dataflow specs +- **Maintain Clean Imports**: Use standard Python module imports (e.g., ``transforms.my_function``) +- **Manage Dependencies**: Install additional Python packages via ``requirements_additional.txt`` +- **Test Independently**: Extensions can be unit tested outside of Spark Declarative Pipelines + +.. 
note:: + + Extensions are loaded during pipeline initialization when the framework adds the ``extensions/`` directory to the Python path. Any additional dependencies specified in ``requirements_additional.txt`` are installed before the pipeline starts. + +How It Works +------------ + +The extension system consists of three main components: + +1. **Extensions Directory**: A ``src/extensions/`` folder in your pipeline bundle containing Python modules +2. **Module References**: Dataflow specs reference extension functions using ``module`` syntax (e.g., ``transforms.my_function``) +3. **Dependency Management**: Optional ``requirements_additional.txt`` files for installing pip packages + +Directory Structure +^^^^^^^^^^^^^^^^^^^ + +Extensions live in the ``src/extensions/`` directory of your pipeline bundle: + +:: + + my_pipeline_bundle/ + ├── src/ + │ ├── extensions/ + │ │ ├── __init__.py # Optional, for package imports + │ │ ├── sources.py # Custom source functions + │ │ ├── transforms.py # Custom transform functions + │ │ └── sinks.py # Custom sink functions + │ ├── dataflows/ + │ │ └── ... + │ └── pipeline_configs/ + │ └── ... + └── requirements_additional.txt # Optional pip dependencies + +Dependency Management +--------------------- + +Extensions may require additional Python packages beyond the framework's core dependencies. For detailed information on managing Python dependencies, see :doc:`feature_python_dependency_management`. + +Extension Examples +------------------ + +Source Extensions +^^^^^^^^^^^^^^^^^ + +Custom functions that generate DataFrames for use as data sources. + + +.. code-block:: python + :caption: src/extensions/sources.py + + from pyspark.sql import DataFrame, SparkSession + from pyspark.sql import functions as F + from typing import Dict + + def get_customer_cdf(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Get customer data with Change Data Feed enabled. 
+ """ + source_table = tokens["sourceTable"] + reader_options = {"readChangeFeed": "true"} + + return ( + spark.readStream + .options(**reader_options) + .table(source_table) + ) + + def get_api_data(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Fetch data from an external API. + """ + import requests # From requirements_additional.txt + + api_url = tokens["apiUrl"] + response = requests.get(api_url) + data = response.json() + + return spark.createDataFrame(data) + +**Reference in Dataflow Spec:** + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 12 + + { + "dataFlowId": "customer_from_extension", + "dataFlowGroup": "my_dataflows", + "dataFlowType": "standard", + "sourceSystem": "custom", + "sourceType": "python", + "sourceViewName": "v_customer", + "sourceDetails": { + "tokens": { + "sourceTable": "{staging_schema}.customer" + }, + "pythonModule": "sources.get_customer_cdf" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer" + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 10 + + dataFlowId: customer_from_extension + dataFlowGroup: my_dataflows + dataFlowType: standard + sourceSystem: custom + sourceType: python + sourceViewName: v_customer + sourceDetails: + tokens: + sourceTable: '{staging_schema}.customer' + pythonModule: sources.get_customer_cdf + mode: stream + targetFormat: delta + targetDetails: + table: customer + +Transform Extensions +^^^^^^^^^^^^^^^^^^^^ + +Custom functions that transform DataFrames after they are read from a source. + +**Function Signatures:** + +.. code-block:: python + + # Without tokens + def my_transform(df: DataFrame) -> DataFrame: + ... + + # With tokens + def my_transform_with_tokens(df: DataFrame, tokens: Dict) -> DataFrame: + ... + +**Example:** + +.. 
code-block:: python + :caption: src/extensions/transforms.py + + from pyspark.sql import DataFrame + from pyspark.sql import functions as F + from typing import Dict + + from pyspark.sql import DataFrame + from pyspark.sql import functions as F + + def explode_deletes_function_transform(df: DataFrame) -> DataFrame: + """ + Duplicates delete records and adjusts sequence_by timestamp. + For deletes: is_delete=0 gets +1ms, is_delete=1 gets +2ms. + """ + # Create array: [0,1] for deletes, [0] for others, then explode + sequence_column = "LOAD_TIMESTAMP" + change_type_column = "meta_cdc_operation" + + is_delete = F.col(change_type_column) == "delete" + array_col = F.when(is_delete, F.array(F.lit(0), F.lit(1))).otherwise(F.array(F.lit(0))) + + return ( + df.withColumnRenamed("_change_type", change_type_column) + .withColumn("is_delete", F.explode(array_col)) + .withColumn( + sequence_column, + F.when(is_delete & (F.col("is_delete") == 0), + F.col(sequence_column) + F.expr("INTERVAL 1 millisecond")) + .when(is_delete & (F.col("is_delete") == 1), + F.col(sequence_column) + F.expr("INTERVAL 2 millisecond")) + .otherwise(F.col(sequence_column)) + ) + ) + +**Reference in Dataflow Spec:** + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 12-14 + + { + "dataFlowId": "customer", + "dataFlowGroup": "my_dataflows", + "dataFlowType": "standard", + "sourceSystem": "erp", + "sourceType": "delta", + "sourceViewName": "v_customer", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true, + "pythonTransform": { + "module": "transforms.explode_deletes_function_transform" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer" + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + :emphasize-lines: 11-12 + + dataFlowId: customer + dataFlowGroup: my_dataflows + dataFlowType: standard + sourceSystem: erp + sourceType: delta + sourceViewName: v_customer + sourceDetails: + database: '{bronze_schema}' + table: customer + cdfEnabled: true + pythonTransform: + module: transforms.explode_deletes_function_transform + mode: stream + targetFormat: delta + targetDetails: + table: customer_aggregated + +Sink Extensions +^^^^^^^^^^^^^^^ + +Custom functions for ``foreach_batch_sink`` targets that process micro-batches. + +**Function Signature:** + +.. code-block:: python + + def my_batch_handler(df: DataFrame, batch_id: int, tokens: Dict) -> None: + """ + Process a micro-batch of data. + + Args: + df: The micro-batch DataFrame + batch_id: The batch identifier + tokens: Dictionary of token values from the dataflow spec + """ + ... + +**Example:** + +.. code-block:: python + :caption: src/extensions/sinks.py + + from pyspark.sql import DataFrame + from typing import Dict + + def write_to_external_api(df: DataFrame, batch_id: int, tokens: Dict) -> None: + """ + Send each batch to an external API. + """ + import requests # From requirements_additional.txt + + api_url = tokens["apiUrl"] + api_key = tokens["apiKey"] + + # Convert to JSON and send + records = df.toJSON().collect() + for record in records: + requests.post( + api_url, + headers={"Authorization": f"Bearer {api_key}"}, + json=record + ) + +**Reference in Dataflow Spec:** + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + :emphasize-lines: 19 + + { + "dataFlowId": "customer_to_api", + "dataFlowGroup": "my_dataflows", + "dataFlowType": "standard", + "sourceSystem": "erp", + "sourceType": "delta", + "sourceViewName": "v_customer_api", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "foreach_batch_sink", + "targetDetails": { + "name": "customer_api_sink", + "type": "python_function", + "config": { + "module": "sinks.write_to_external_api", + "tokens": { + "apiUrl": "https://api.example.com/customers", + "apiKey": "{api_secret_key}" + } + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 17 + + dataFlowId: customer_to_api + dataFlowGroup: my_dataflows + dataFlowType: standard + sourceSystem: erp + sourceType: delta + sourceViewName: v_customer_api + sourceDetails: + database: '{silver_schema}' + table: customer + cdfEnabled: true + mode: stream + targetFormat: foreach_batch_sink + targetDetails: + name: customer_api_sink + type: python_function + config: + module: sinks.write_to_external_api + tokens: + apiUrl: https://api.example.com/customers + apiKey: '{api_secret_key}' + +Additional Resources +-------------------- + +- :doc:`feature_python_dependency_management` - Managing Python dependencies +- :doc:`feature_python_source` - Using Python as a source type +- :doc:`feature_python_functions` - Python transform functions (file path approach) +- :doc:`dataflow_spec_ref_source_details` - Complete source configuration reference +- :doc:`dataflow_spec_ref_target_details` - Complete target configuration reference + diff --git a/docs/source/feature_python_functions.rst b/docs/source/feature_python_functions.rst new file mode 100644 index 0000000..8acc36f --- /dev/null +++ b/docs/source/feature_python_functions.rst @@ -0,0 +1,287 @@ +Python Function Transforms +=============================== + +.. 
list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Pipeline` + * - **Databricks Docs:** + - NA + +Overview +-------- +You can specify custom Python functions or transforms in your Pipeline Bundle and then reference these in your data flow specs. +These allow for flexibility and more complex transformations to be supported without overly complicating the Framework. + +The functions get called and executed by the framework directly after a View reads from its source. + +There are two approaches to defining Python transforms: + +1. **Extensions**: Define functions in the ``src/extensions/`` directory and reference them by module name +2. **File Path**: Define functions in ``./python_functions/`` directories and reference by file path + +Sample Bundle +------------- + +Samples are available in the ``bronze_sample`` bundle in the ``src/dataflows/feature_samples`` folder. + +Configuration +------------- + +Using Extensions +~~~~~~~~~~~~~~~~ + +The extensions approach allows you to organize your Python functions in a central location and import them as standard Python modules. + +**1. Create an Extension Module** + +Create your transform functions in the ``src/extensions/`` directory of your pipeline bundle: + +:: + + my_pipeline_bundle/ + ├── src/ + │ ├── extensions/ + │ │ └── transforms.py # Your transform functions + │ ├── dataflows/ + │ │ └── ... + +Your extension module can contain multiple functions: + +.. code-block:: python + + # src/extensions/transforms.py + from pyspark.sql import DataFrame + from pyspark.sql import functions as F + from typing import Dict + + def customer_aggregation(df: DataFrame) -> DataFrame: + """ + Apply customer aggregation transformation. 
+ """ + return ( + df.withWatermark("load_timestamp", "10 minutes") + .groupBy("CUSTOMER_ID") + .agg(F.count("*").alias("COUNT")) + ) + + def customer_aggregation_with_tokens(df: DataFrame, tokens: Dict) -> DataFrame: + """ + Apply aggregation with configurable parameters from tokens. + """ + watermark_column = tokens.get("watermarkColumn", "load_timestamp") + watermark_delay = tokens.get("watermarkDelay", "10 minutes") + group_by_column = tokens.get("groupByColumn", "CUSTOMER_ID") + + return ( + df.withWatermark(watermark_column, watermark_delay) + .groupBy(group_by_column) + .agg(F.count("*").alias("COUNT")) + ) + +**2. Reference in Data Flow Spec** + +Use ``pythonTransform.module`` to reference your function: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 12-14 + + { + "dataFlowId": "feature_python_extension_transform", + "dataFlowGroup": "feature_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_python_extension_transform", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "pythonTransform": { + "module": "transforms.customer_aggregation" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_extension_transform", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + :emphasize-lines: 11-12 + + dataFlowId: feature_python_extension_transform + dataFlowGroup: feature_samples + dataFlowType: standard + sourceSystem: testSystem + sourceType: delta + sourceViewName: v_feature_python_extension_transform + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true + pythonTransform: + module: transforms.customer_aggregation + mode: stream + targetFormat: delta + targetDetails: + table: feature_python_extension_transform + tableProperties: + delta.enableChangeDataFeed: 'true' + +**Using Tokens with Extensions** + +You can pass configuration tokens to your transform function: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + "pythonTransform": { + "module": "transforms.customer_aggregation_with_tokens", + "tokens": { + "watermarkColumn": "event_timestamp", + "watermarkDelay": "5 minutes", + "groupByColumn": "ORDER_ID" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pythonTransform: + module: transforms.customer_aggregation_with_tokens + tokens: + watermarkColumn: event_timestamp + watermarkDelay: 5 minutes + groupByColumn: ORDER_ID + + +Using File Path +~~~~~~~~~~~~~~~ + +To define a python function using file paths, create a ``python_functions`` folder under the base folder for your dataflowspec: + +:: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows/ + │ │ ├── use_case_1/ + │ │ │ ├── dataflowspec/ + │ │ │ │ └── my_data_flow_spec_main.json + │ │ │ ├── python_functions/ + │ │ │ │ └── my_function.py + │ │ │ └── schemas/ + +Your file must contain a function called ``apply_transform`` that: + +* Takes a DataFrame as the first parameter (and optionally tokens as the second) +* Returns a DataFrame + +.. code-block:: python + + from pyspark.sql import DataFrame + from pyspark.sql import functions as F + + def apply_transform(df: DataFrame, tokens: Dict) -> DataFrame: + """ + Apply a transformation to the DataFrame. 
+ """ + return ( + df.withWatermark("load_timestamp", "1 minute") + .groupBy("CUSTOMER_ID") + .agg(F.count("*").alias("COUNT")) + ) + +**Reference using pythonTransform.functionPath:** + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 12-14 + + { + "dataFlowId": "feature_python_function_transform", + "dataFlowGroup": "feature_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_python_function_transform", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "pythonTransform": { + "functionPath": "my_function.py" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_function_transform" + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 11-12 + + dataFlowId: feature_python_function_transform + dataFlowGroup: feature_samples + dataFlowType: standard + sourceSystem: testSystem + sourceType: delta + sourceViewName: v_feature_python_function_transform + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true + pythonTransform: + functionPath: my_function.py + mode: stream + targetFormat: delta + targetDetails: + table: feature_python_function_transform + +pythonTransform Schema +---------------------- + +The ``pythonTransform`` object supports the following properties: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 65 + + * - Property + - Required + - Description + * - ``module`` + - One of module/functionPath + - Module and function reference (e.g., ``transforms.customer_aggregation``). The module must be in the ``src/extensions/`` directory. + * - ``functionPath`` + - One of module/functionPath + - Path to a Python file containing an ``apply_transform`` function. Resolved relative to the ``./python_functions/`` directory. + * - ``tokens`` + - No + - Dictionary of token values to pass to the transform function. 
The function signature must accept ``tokens`` as a second parameter. + diff --git a/docs/source/feature_python_source.rst b/docs/source/feature_python_source.rst new file mode 100644 index 0000000..64a9003 --- /dev/null +++ b/docs/source/feature_python_source.rst @@ -0,0 +1,272 @@ +Python Source +============= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Data Flow Spec` + * - **Databricks Docs:** + - NA + +Overview +-------- +You can specify a Python function as a source type in your Data Flow Specs. These allow for flexibility and more complex data retrieval to be +supported, as needed, without overly complicating the Framework. + +There are two approaches to defining Python sources: + +1. **Extensions**: Define functions in the ``src/extensions/`` directory and reference them by module name using ``pythonModule`` +2. **File Path**: Define functions in ``./python_functions/`` directories and reference by file path using ``functionPath`` + +Sample Bundle +------------- + +Samples are available in the ``bronze_sample`` bundle in the ``src/dataflows/feature_samples`` folder. + +Configuration +------------- + +Using Extensions +~~~~~~~~~~~~~~~~ + +The extensions approach allows you to organize your Python source functions in a central location and import them as standard Python modules. + +**1. Create an Extension Module** + +Create your source functions in the ``extensions/`` directory at the bundle root: + +:: + + my_pipeline_bundle/ + ├── src/ + │ ├── extensions/ + │ │ └── sources.py # Your source functions + │ ├── dataflows/ + │ │ └── ... + +Your extension module can contain multiple functions. Each function must: + +* Accept ``spark`` (SparkSession) and ``tokens`` (Dict) as parameters +* Return a DataFrame + +.. 
code-block:: python + + # src/extensions/sources.py + from pyspark.sql import DataFrame, SparkSession + from pyspark.sql import functions as F + from typing import Dict + + def get_customer_cdf(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Get customer data with Change Data Feed enabled. + """ + source_table = tokens["sourceTable"] + reader_options = { + "readChangeFeed": "true" + } + + df = spark.readStream.options(**reader_options).table(source_table) + return df.withColumn("TEST_COLUMN", F.lit("testing from extension...")) + + def get_orders_batch(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Get orders data as a batch read. + """ + source_table = tokens["sourceTable"] + return spark.read.table(source_table) + +**2. Reference in Data Flow Spec** + +Use ``pythonModule`` in ``sourceDetails`` to reference your function: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 6,12 + + { + "dataFlowId": "feature_python_extension_source", + "dataFlowGroup": "feature_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "python", + "sourceViewName": "v_feature_python_extension_source", + "sourceDetails": { + "tokens": { + "sourceTable": "{staging_schema}.customer" + }, + "pythonModule": "sources.get_customer_cdf" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_extension_source", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + :emphasize-lines: 6,11 + + dataFlowId: feature_python_extension_source + dataFlowGroup: feature_samples + dataFlowType: standard + sourceSystem: testSystem + sourceType: python + sourceViewName: v_feature_python_extension_source + sourceDetails: + tokens: + sourceTable: '{staging_schema}.customer' + pythonModule: sources.get_customer_cdf + mode: stream + targetFormat: delta + targetDetails: + table: feature_python_extension_source + tableProperties: + delta.enableChangeDataFeed: 'true' + +Using File Path +~~~~~~~~~~~~~~~ + +To define a python source function using file paths, create a ``python_functions`` folder under the base folder for your dataflowspec: + +:: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows/ + │ │ ├── use_case_1/ + │ │ │ ├── dataflowspec/ + │ │ │ │ └── my_data_flow_spec_main.json + │ │ │ ├── python_functions/ + │ │ │ │ └── my_source_function.py + │ │ │ └── schemas/ + +Your file must contain a function called ``get_df`` that: + +* Accepts ``spark`` (SparkSession) and ``tokens`` (Dict) as parameters +* Returns a DataFrame + +.. code-block:: python + + from pyspark.sql import DataFrame, SparkSession + from pyspark.sql import functions as F + from typing import Dict + + def get_df(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Get a DataFrame from the source details with applied transformations. + """ + source_table = tokens["sourceTable"] + reader_options = { + "readChangeFeed": "true" + } + + df = spark.readStream.options(**reader_options).table(source_table) + return df.withColumn("TEST_COLUMN", F.lit("testing...")) + +**Reference using functionPath:** + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + :emphasize-lines: 6,12 + + { + "dataFlowId": "feature_python_function_source", + "dataFlowGroup": "feature_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "python", + "sourceViewName": "v_feature_python_function_source", + "sourceDetails": { + "tokens": { + "sourceTable": "{staging_schema}.customer" + }, + "functionPath": "my_source_function.py" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_function_source" + } + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 6,11 + + dataFlowId: feature_python_function_source + dataFlowGroup: feature_samples + dataFlowType: standard + sourceSystem: testSystem + sourceType: python + sourceViewName: v_feature_python_function_source + sourceDetails: + tokens: + sourceTable: '{staging_schema}.customer' + functionPath: my_source_function.py + mode: stream + targetFormat: delta + targetDetails: + table: feature_python_function_source + +sourceDetails Schema for Python Source +-------------------------------------- + +When using ``sourceType: "python"``, the ``sourceDetails`` object supports the following properties: + +.. list-table:: + :header-rows: 1 + :widths: 20 15 65 + + * - Property + - Required + - Description + * - ``pythonModule`` + - One of pythonModule/functionPath + - Module and function reference (e.g., ``sources.get_customer_cdf``). The module must be in the ``src/extensions/`` directory. + * - ``functionPath`` + - One of pythonModule/functionPath + - Path to a Python file containing a ``get_df`` function. Resolved relative to the ``./python_functions/`` directory. + * - ``tokens`` + - No + - Dictionary of token values to pass to the source function. Supports substitution variables like ``{staging_schema}``. + +Function Signatures +------------------- + +**For Extensions (pythonModule)** + +The function name can be anything, but it must accept ``spark`` and ``tokens``: + +.. 
code-block:: python + + def my_source_function(spark: SparkSession, tokens: Dict) -> DataFrame: + ... + +**For File Path (functionPath)** + +The function must be named ``get_df``: + +.. code-block:: python + + def get_df(spark: SparkSession, tokens: Dict) -> DataFrame: + ... + +Additional Resources +-------------------- + +Refer to the :doc:`dataflow_spec_ref_source_details` section of the :doc:`dataflow_spec_reference` documentation for more information on source configuration. diff --git a/docs/source/feature_schemas.rst b/docs/source/feature_schemas.rst new file mode 100644 index 0000000..ab851e2 --- /dev/null +++ b/docs/source/feature_schemas.rst @@ -0,0 +1,199 @@ +Schemas +======= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - NA + +The Framework supports the definition of schemas in the following ways: + +* Schema on Read: + * A schema can be specified on most sources using JSON StructType format +* Schema on Write: + * A schema can be specified for Staging or Target tables using JSON StructType format or text DDL format. + +Schema Types +------------ + +.. list-table:: + :header-rows: 0 + + * - **Type** + - **Description** + - **Supports** + * - JSON StructType + - Allows you to specify the schema as a StructType in JSON format. + - - Can be used for both Schema on Read and Schema on Write. + - Can be used to define columns only. + * - Text DDL + - Allows you to specify the schema as a text DDL format. + - - Can only be used to specify the schemas for your staging or target tables. 
+ - Feature Support: + + - `Constraints <https://docs.databricks.com/aws/en/tables/constraints>`_ + - `Generated Columns <https://docs.databricks.com/aws/en/delta/generated-columns>`_ + - `Column Masking Functions <https://docs.databricks.com/aws/en/dlt/unity-catalog#row-filters-and-column-masks>`_ + +Schema File Location +-------------------- + +Schemas must be specified in their own dedicated files and will be located in a schemas folder, dependent on your chosen bundle structure as discussed in the :doc:`build_pipeline_bundle_structure` section. + +Data Flow Spec Configuration +---------------------------- + +Schema files are then referenced in the Data Flow Spec configuration for the source, staging table or target table they apply to. Refer to the :doc:`data_flow_spec` section for more information. + + +StructType JSON Format +---------------------- + +* Can be used for both Schema on Read and Schema on Write. +* File name: ``.json``, the file MUST have a ``.json`` extension. +* Documentation: + + * PySpark StructType documentation: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.types.StructType.html + * Databricks Data Types documentation: https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html + +Generating the Schema Definition +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**PySpark:** + +If you have your data in Databricks, you can read your source into a dataframe and then use the following code to generate the JSON schema format: + +.. code-block:: python + + df.schema.jsonValue() + +**LLM:** + +In Perplexity or ChatGPT the following prompt will generate the JSON schema format: + +Prompt:: + + Convert the following schema definition into the equivalent Databricks StructType JSON format. The output should be a valid JSON object representing the schema, including all field names, data types, nullability, and nested structures where applicable. Do not include any explanatory text—just the JSON output. + + Input schema: + [Insert schema definition here] + + +For Example: +~~~~~~~~~~~~ + +.. 
code-block:: json + :emphasize-lines: 1-3, 40-41 + + { + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] + } + +.. important:: + + The highlighted wrapping struct fields declaration is mandatory. + + +Text DDL Format +--------------- + +* Can only be used to specify the schemas for your staging or target tables. +* Feature support: + * Constraints - https://docs.databricks.com/aws/en/tables/constraints + * Generated Columns - https://docs.databricks.com/aws/en/delta/generated-columns + * Column Masking Functions - https://docs.databricks.com/aws/en/dlt/unity-catalog#row-filters-and-column-masks +* File name: ``.ddl``, the file MUST have a ``.ddl`` extension. +* Documentation: + + * CREATE TABLE Documentation: https://docs.databricks.com/gcp/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using + +.. admonition:: DDL Format Rules + :class: note + + * Each column must be defined on a new line + * Use ``--`` to comment out columns (they will be removed from the schema) + * Column names and data types must be valid according to Databricks SQL specifications + +Examples +~~~~~~~~ + +Basic Schema Definition: + +.. code-block:: text + + CUSTOMER_ID integer NOT NULL, + FIRST_NAME string, + LAST_NAME string, + EMAIL string, + DELETE_FLAG boolean, + LOAD_TIMESTAMP timestamp + +Schema Definition with Constraint and Generated Column: + +.. 
code-block:: text + :emphasize-lines: 7, 8 + + CUSTOMER_ID integer NOT NULL, + FIRST_NAME string, + LAST_NAME string, + EMAIL string, + DELETE_FLAG boolean, + LOAD_TIMESTAMP timestamp, + LOAD_YEAR int GENERATED ALWAYS AS (YEAR(LOAD_TIMESTAMP)), + CONSTRAINT pk_customer PRIMARY KEY(CUSTOMER_ID) + +Schema Definition with Comments: + +.. code-block:: text + :emphasize-lines: 7 + + CUSTOMER_ID integer NOT NULL, + FIRST_NAME string, + LAST_NAME string, + EMAIL string, + DELETE_FLAG boolean, + LOAD_TIMESTAMP timestamp, + -- CONSTRAINT pk_customer PRIMARY KEY(CUSTOMER_ID) diff --git a/docs/source/feature_secrets.rst b/docs/source/feature_secrets.rst new file mode 100644 index 0000000..ff60ba4 --- /dev/null +++ b/docs/source/feature_secrets.rst @@ -0,0 +1,205 @@ +Secrets Management +================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Pipeline` + * - **Databricks Docs:** + - https://docs.databricks.com/en/security/secrets + +Overview +-------- +Databricks natively supports secrets management and the secure retrieval of credentials, connection details, host names or +other sensitive information for use at pipeline execution time. This negates the need to include these details directly in your data flow specs, config files or code. + +.. important:: + + Credentials and other sensitive information should never be hardcoded in your data flow specs, config files or code. + Use the native Databricks secrets management features to store and retrieve these details. + +The Framework allows you to specify the Secret Scopes and Secrets you need access to at Pipeline Bundle level and then reference these in your data flow specs. + +.. 
important:: + + Secret management is implemented in such a way that: + + - Secrets are not cached by the Framework + - Secrets appear as ``[REDACTED]`` in the Framework logs or any print statements + - Secrets are retrieved only at pipeline execution time + - Secrets appear as ``[REDACTED]`` in any downstream conversions + +.. warning:: + + DO NOT CHANGE THE SECRET MANAGER IMPLEMENTATION WITHOUT TALKING TO YOUR FRAMEWORK OWNER FIRST! + +Configuration +------------- + +| **Scope: Pipeline** +| In a Pipeline bundle, secrets are defined in the following configuration file: ``src/pipeline_config/<env>_secrets.json|yaml`` +| e.g. ``src/pipeline_config/dev_secrets.json|yaml`` + +Configuration Schema +-------------------- +A secrets config file has the following structure: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "<secret_alias_1>": { + "scope": "<secret_scope>", + "key": "<secret_key>" + }, + "<secret_alias_2>": { + "scope": "<secret_scope>", + "key": "<secret_key>" + }, + ... + } + + .. tab:: YAML + + .. code-block:: yaml + + <secret_alias_1>: + scope: <secret_scope> + key: <secret_key> + <secret_alias_2>: + scope: <secret_scope> + key: <secret_key> + ... + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + * - **secret alias** + - The alias used to reference the secret in your data flow specs + * - **scope** + - The Secret Scope that the secret belongs to in Databricks + * - **key** + - The key of the secret in the specified Secret Scope. + +Referencing Secrets in Data Flow Specs +-------------------------------------- +Secrets can be referenced as a value in any part of your data flow specs by using the following syntax: ``${secret.<secret_alias>}``. + +For example, assume we want to connect to Kafka and we need to provide a keystore password. We would first ensure that the secret is configured in the secrets config file discussed above as follows: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "kafka_source_bootstrap_servers_password": { + "scope": "mySecretScope", + "key": "KafkaSecretKey" + } + } + + .. tab:: YAML + + ..
code-block:: yaml + + kafka_source_bootstrap_servers_password: + scope: mySecretScope + key: KafkaSecretKey + +We can then reference the secret in any data flow spec as per the highlighted line in the code sample below: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + :emphasize-lines: 12 + + { + "dataFlowId": "kafka_source_topic_1_staging", + "dataFlowGroup": "kafka_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "kafka", + "sourceViewName": "v_topic_1", + "sourceDetails": { + "readerOptions": { + "kafka.bootstrap.servers": "{kafka_source_bootstrap_servers}", + "kafka.security.protocol": "SSL", + "kafka.ssl.keystore.password": "${secret.kafka_source_bootstrap_servers_password}", + "subscribe": "{kafka_source_topic}", + "startingOffsets": "earliest" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "topic_1_staging", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "off" + } + + .. tab:: YAML + + .. code-block:: yaml + :emphasize-lines: 11 + + dataFlowId: kafka_source_topic_1_staging + dataFlowGroup: kafka_samples + dataFlowType: standard + sourceSystem: testSystem + sourceType: kafka + sourceViewName: v_topic_1 + sourceDetails: + readerOptions: + kafka.bootstrap.servers: '{kafka_source_bootstrap_servers}' + kafka.security.protocol: SSL + kafka.ssl.keystore.password: ${secret.kafka_source_bootstrap_servers_password} + subscribe: '{kafka_source_topic}' + startingOffsets: earliest + mode: stream + targetFormat: delta + targetDetails: + table: topic_1_staging + tableProperties: + delta.enableChangeDataFeed: 'true' + dataQualityExpectationsEnabled: false + quarantineMode: 'off' + +Best Practices +-------------- +1. Never store secrets in code or configuration files +2. Use appropriate secret scopes for different environments +3. Rotate secrets regularly +4.
Limit access to secret scopes + +Troubleshooting +--------------- + +Secret Access Denied +^^^^^^^^^^^^^^^^^^^^ +- Verify secret scope exists +- Check access permissions +- Validate key names +- Review scope configuration + +Configuration Errors +^^^^^^^^^^^^^^^^^^^^ +- Validate config file format +- Check for missing fields +- Verify string values +- Review scope/key names \ No newline at end of file diff --git a/docs/source/feature_soft_deletes.rst b/docs/source/feature_soft_deletes.rst new file mode 100644 index 0000000..4eca6ad --- /dev/null +++ b/docs/source/feature_soft_deletes.rst @@ -0,0 +1,26 @@ +Soft Deletes +============ + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/python-ref.html#change-data-capture-from-a-change-feed-with-python-in-delta-live-tables + +The Lakeflow Framework provides support for soft deletes when processing change data capture (CDC) events using the ``apply_changes()`` function. This allows you to handle DELETE operations in your CDC data while maintaining a history of deleted records. + +When soft deletes are configured, deleted rows are temporarily retained as tombstones in the underlying Delta table. A view is created in the metastore that filters out these tombstones, providing a "live" view of current records. + +Configuration +------------- + +Soft deletes are configured using the ``apply_as_deletes`` parameter in the ``apply_changes()`` function. This parameter specifies the condition that indicates when a CDC event should be treated as a DELETE rather than an upsert. 
+ +The condition can be specified either as: + +- A string expression: ``"Operation = 'DELETE'"`` +- A Spark SQL expression using ``expr()``: ``expr("Operation = 'DELETE'")`` \ No newline at end of file diff --git a/docs/source/feature_source_types.rst b/docs/source/feature_source_types.rst new file mode 100644 index 0000000..6da9bc8 --- /dev/null +++ b/docs/source/feature_source_types.rst @@ -0,0 +1,87 @@ +Supported Source Types +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/python-ref.html + +The Lakeflow Framework supports multiple source types. Each source type provides specific configuration +options to handle different data ingestion scenarios. + +Source Types +------------ + +.. list-table:: + :header-rows: 1 + :widths: 20 100 100 + + * - **Type** + - **Description** + - **Key Features** + * - **Batch Files** + - Reads data from UC Volumes or cloud storage locations (e.g., S3, ADLS, GCS). Supports various file formats and provides options for filtering and transforming data during ingestion. + - - Flexible path-based file access + - Reader options for different file formats + - Optional select expressions and where clauses + - Schema on read support + * - **Cloud Files** + - Reads data from UC Volumes or cloud storage locations (e.g., S3, ADLS, GCS). Supports various file formats and provides options for filtering and transforming data during ingestion. + - - Flexible path-based file access + - Reader options for different file formats + - Optional select expressions and where clauses + - Schema on read support + * - **Delta** + - Connects to existing Delta tables in the metastore, supporting both batch and streaming reads with change data feed (CDF) capabilities.
+ - - Database and table-based access + - Change Data Feed (CDF) support + - Optional path-based access + - Configurable reader options + * - **Delta Join** + - Enables joining multiple Delta tables, supporting both streaming and static join patterns. + - - Multiple source table configuration + - Stream and static join modes + - Left and inner join support + - Flexible join conditions + - Per-source CDF configuration + * - **Kafka** + - Enables reading from Apache Kafka topics for real-time streaming data processing. + - - Kafka-specific reader options + - Schema definition support + - Filtering and transformation support + - Topic-based configurations + - Demux and fan-out support + * - **Python** + - Allows using a Python function as a data source, providing flexibility for complex data transformations. + - - Python file-based configuration + - Functions stored in `python_functions` subdirectory + - Full Python / Pyspark capabilities + - Detailed configuration details: :doc:`feature_python_source` + * - **SQL** + - Allows using SQL queries as data sources, providing flexibility for complex data transformations. + - - SQL file-based configuration + - Queries stored in `dml` subdirectory + - Full SQL transformation capabilities + - Detailed configuration details: :doc:`feature_sql_source` + +General Data Flow Spec Configuration +------------------------------------ + +Set as an attribute when creating your Data Flow Spec, refer to the :doc:`dataflow_spec_reference` documentation for more information: + +* :doc:`dataflow_spec_ref_source_details` +* :doc:`dataflow_spec_ref_target_details` + +Detailed Source Type Configuration Details +------------------------------------------ + +.. 
toctree:: + :maxdepth: 1 + + feature_python_source + feature_sql_source diff --git a/docs/source/feature_spark_configuration.rst b/docs/source/feature_spark_configuration.rst new file mode 100644 index 0000000..df99a08 --- /dev/null +++ b/docs/source/feature_spark_configuration.rst @@ -0,0 +1,172 @@ +Spark Configuration +=================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Framework Bundle` :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Global` :bdg-success:`Pipeline` + * - **Databricks Docs:** + - https://spark.apache.org/docs/latest/configuration.html + +The Spark Configuration feature allows you to define and manage Spark configurations either globally at framework level across all pipelines or at pipeline bundle level. + +Configuration +------------- + +| **Scope: Global** +| In the Framework bundle, Spark configurations are defined in the global configuration file located at: ``src/config/global.json|yaml`` under the ``spark_config`` section. + +| **Scope: Pipeline** +| In a Pipeline bundle, Spark configurations are defined in the pipeline configuration file located at: ``src/pipeline_configs/global.json|yaml`` under the ``spark_config`` section. + +Configuration Schema +-------------------- + +The Spark configuration section must follow this schema: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "spark_config": { + "<configuration_key>": "<configuration_value>", + ... + } + } + + .. tab:: YAML + + .. code-block:: yaml + + spark_config: + <configuration_key>: <configuration_value> + ... + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + * - **configuration_key** + - The Spark configuration property name (e.g., "spark.sql.shuffle.partitions") + * - **configuration_value** + - The value to set for the configuration property. Can be string, number, or boolean depending on the property. + +Common Configuration Properties +------------------------------- + +Here are some commonly used Spark configuration properties: + +..
list-table:: + :header-rows: 1 + + * - Property + - Description + - Default Value + * - **spark.sql.shuffle.partitions** + - Number of partitions to use for shuffle operations + - 200 + * - **spark.sql.files.maxPartitionBytes** + - Maximum size of a partition during file read + - 128MB + * - **spark.sql.adaptive.enabled** + - Enable adaptive query execution + - true + * - **spark.sql.broadcastTimeout** + - Timeout in seconds for broadcast joins + - 300 + +Example Configuration +--------------------- + +Below is an example of a typical Spark configuration in the ``global.json|yaml`` file: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "spark_config": { + "spark.sql.shuffle.partitions": "200", + "spark.sql.adaptive.enabled": "true", + "spark.sql.files.maxPartitionBytes": "134217728", + "spark.sql.broadcastTimeout": "300" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + spark_config: + spark.sql.shuffle.partitions: '200' + spark.sql.adaptive.enabled: 'true' + spark.sql.files.maxPartitionBytes: '134217728' + spark.sql.broadcastTimeout: '300' + +.. admonition:: Best Practice + :class: note + + * Start with the default Spark configurations and adjust based on your specific workload needs + * Monitor query performance and resource utilization to optimize configurations + * Document any non-standard configuration changes and their rationale + * Test configuration changes in development before applying to production + +.. Note:: + Some Spark configurations may be overridden by Databricks cluster configurations or job-specific settings. Refer to the Databricks documentation for the configuration precedence rules. + +Advanced Usage +-------------- + +Dynamic Configuration +^^^^^^^^^^^^^^^^^^^^^ + +For certain use cases, you may want to set different Spark configurations based on the environment or workload. This can be achieved using environment variables or the substitutions feature of the framework. + +Example with environment-specific configurations: + +..
tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "spark_config": { + "spark.sql.shuffle.partitions": "${SHUFFLE_PARTITIONS}", + "spark.sql.adaptive.enabled": "${ADAPTIVE_EXECUTION_ENABLED}" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + spark_config: + spark.sql.shuffle.partitions: ${SHUFFLE_PARTITIONS} + spark.sql.adaptive.enabled: ${ADAPTIVE_EXECUTION_ENABLED} + +Performance Tuning +^^^^^^^^^^^^^^^^^^ + +When tuning Spark configurations for performance: + +1. Start with the defaults +2. Monitor query performance and resource utilization +3. Identify bottlenecks +4. Adjust relevant configurations +5. Test and measure impact +6. Document successful optimizations + +.. admonition:: Important + :class: warning + + Incorrect Spark configurations can significantly impact performance and stability. Always test configuration changes in a development environment first. \ No newline at end of file diff --git a/docs/source/feature_spec_format.rst b/docs/source/feature_spec_format.rst new file mode 100644 index 0000000..ce95343 --- /dev/null +++ b/docs/source/feature_spec_format.rst @@ -0,0 +1,393 @@ +Data Flow Specification Format +============================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Framework` :bdg-info:`Pipeline` + +Overview +-------- +The Framework supports both JSON and YAML formats for defining pipeline specifications, providing flexibility in how you author and maintain your data flow specs, substitution files, secrets files, and other configuration files. + +..
important:: + + The specification format applies to all configuration files in a Pipeline Bundle, including: + + - Data flow specifications (main specs and flow groups) + - Data quality expectations + - Substitution files + - Secrets files + +This feature allows development teams to choose the format that best suits their workflow and preferences, while maintaining full compatibility with the Framework's validation and execution capabilities. + +.. note:: + + Both formats are functionally equivalent and fully interchangeable. The choice between JSON and YAML is purely a matter of preference and workflow requirements. + +Configuration +------------- + +The specification format can be configured at two levels: + +1. **Framework Level**: Global configuration that applies to all Pipeline Bundles +2. **Pipeline Level**: Pipeline-specific configuration that can override the global setting (if allowed) + +Framework-Level Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +| **Scope: Framework** +| The global specification format is defined in the Framework's global configuration file: ``src/config/global.json|yaml`` + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "json", + "allow_override": false + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: json + allow_override: false + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + - Valid Values + - Default + * - **format** + - The default specification format for all Pipeline Bundles + - ``"json"`` or ``"yaml"`` + - ``"json"`` + * - **allow_override** + - Whether individual Pipeline Bundles can override the global format setting + - ``true`` or ``false`` + - ``false`` + +Pipeline-Level Configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +| **Scope: Pipeline** +| In a Pipeline Bundle, the specification format can be set in the pipeline configuration file: ``src/pipeline_configs/global.json|yaml`` + +.. tabs:: + + .. 
tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "yaml" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: yaml + +.. important:: + + Pipeline-level overrides are only permitted if ``allow_override`` is set to ``true`` in the Framework's global configuration. If ``allow_override`` is ``false``, attempting to override the format will result in a validation error. + +Supported File Types and Naming Conventions +-------------------------------------------- + +The Framework automatically detects the specification format based on file naming conventions: + +JSON Format +^^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - File Type + - File Suffix + * - **Main Specifications** + - ``*_main.json`` + * - **Flow Group Specifications** + - ``*_flow.json`` + * - **Data Quality Expectations** + - ``*_dqe.json`` + * - **Secrets Files** + - ``*_secrets.json`` + * - **Substitution Files** + - ``*_substitutions.json`` + +YAML Format +^^^^^^^^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 40 60 + + * - File Type + - File Suffix + * - **Main Specifications** + - ``*_main.yaml`` or ``*_main.yml`` + * - **Flow Group Specifications** + - ``*_flow.yaml`` or ``*_flow.yml`` + * - **Data Quality Expectations** + - ``*_expectations.yaml`` or ``*_expectations.yml`` + * - **Secrets Files** + - ``*_secrets.yaml`` or ``*_secrets.yml`` + * - **Substitution Files** + - ``*_substitutions.yaml`` or ``*_substitutions.yml`` + +.. note:: + + The Framework supports both ``.yaml`` and ``.yml`` extensions for YAML files. Use whichever convention your team prefers, but be consistent within a Pipeline Bundle. + +Example Specification +---------------------- + +The following example shows a data flow specification in both JSON and YAML formats: + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "dataFlowId": "customer_main", + "dataFlowGroup": "customers", + "dataFlowType": "standard", + "sourceSystem": "sourceA", + "sourceType": "autoloader", + "sourceFormat": "json", + "sourceDetails": { + "path": "${base_data_dir}/customer_data", + "readerOptions": { + "cloudFiles.format": "json", + "cloudFiles.inferColumnTypes": "true" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "dataQualityExpectationsEnabled": true, + "quarantineMode": "on" + } + + .. tab:: YAML + + .. code-block:: yaml + + dataFlowId: customer_main + dataFlowGroup: customers + dataFlowType: standard + sourceSystem: sourceA + sourceType: autoloader + sourceFormat: json + sourceDetails: + path: ${base_data_dir}/customer_data + readerOptions: + cloudFiles.format: json + cloudFiles.inferColumnTypes: 'true' + mode: stream + targetFormat: delta + targetDetails: + table: customer + tableProperties: + delta.enableChangeDataFeed: 'true' + dataQualityExpectationsEnabled: true + quarantineMode: 'on' + +Best Practices +-------------- + +1. **Choose One Format Globally**: While technically possible to mix formats across Bundles, it's recommended to standardise on a single format. + +2. **Version Control Considerations**: YAML may produce cleaner diffs in version control systems due to its more human-readable format and lack of trailing commas. + +3. **Validation**: Always validate specifications after conversion or manual edits using the Framework's built-in validation capabilities. + +4. **Schema Files**: Schema files (``*_schema.json``) remain in JSON or DDL format regardless of the specification format setting, as these are the only supported formats for schema definitions.
+ +Configuration Examples +---------------------- + +Example 1: Framework Enforces JSON Format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Framework Configuration** (``src/config/global.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "json", + "allow_override": false + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: json + allow_override: false + +**Result**: All Pipeline Bundles must use JSON format. Pipeline-level overrides will be rejected. + +Example 2: Framework Allows Format Flexibility +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Framework Configuration** (``src/config/global.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "json", + "allow_override": true + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: json + allow_override: true + +**Pipeline Configuration** (``src/pipeline_configs/global.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "yaml" + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: yaml + +**Result**: This specific Pipeline Bundle will use YAML format, while other bundles will default to JSON unless explicitly overridden. + +Example 3: Framework Defaults to YAML +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**Framework Configuration** (``src/config/global.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "pipeline_bundle_spec_format": { + "format": "yaml", + "allow_override": false + } + } + + .. tab:: YAML + + .. code-block:: yaml + + pipeline_bundle_spec_format: + format: yaml + allow_override: false + +**Result**: All Pipeline Bundles must use YAML format. This is useful when migrating an entire organization to YAML. 
+ +Troubleshooting +--------------- + +Format Mismatch Errors +^^^^^^^^^^^^^^^^^^^^^^ +**Problem**: Framework reports that files cannot be found or loaded. + +**Solution**: +- Verify the ``format`` setting in both Framework and Pipeline configurations +- Ensure file suffixes match the configured format (e.g., ``*_main.yaml`` for YAML) +- Check that all files in the bundle use consistent naming conventions + +Override Not Permitted +^^^^^^^^^^^^^^^^^^^^^^ +**Problem**: Error message: "Pipeline bundle spec format has been set at global framework level. Override has been disabled." + +**Solution**: +- This occurs when attempting to override the format at Pipeline level when ``allow_override`` is ``false`` +- Either remove the Pipeline-level configuration or request that ``allow_override`` be enabled in the Framework configuration + +Invalid Format Value +^^^^^^^^^^^^^^^^^^^^ +**Problem**: Error message: "Invalid pipeline bundle spec format: " + +**Solution**: +- Ensure the ``format`` field is set to either ``"json"`` or ``"yaml"`` +- Check for typos in the configuration file +- Validate the JSON syntax of the configuration file + +Validation Errors After Conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +**Problem**: YAML files fail validation after conversion from JSON. + +**Solution**: +- Validate the YAML syntax and structure +- Check for data type issues (e.g., boolean values should be ``true``/``false``, not strings) +- Ensure quotes are preserved around string values that look like other types (e.g., ``"true"`` vs ``true``) +- Review the specification for any structural issues + +Mixed Format Detection +^^^^^^^^^^^^^^^^^^^^^^ +**Problem**: Bundle contains both JSON and YAML files with the same base name. 
+ +**Solution**: +- The Framework will load files based on the configured format +- Remove files that don't match the configured format to avoid confusion +- Ensure consistent naming conventions throughout the bundle + +See Also +-------- +- :doc:`feature_substitutions` - Using substitutions in specifications +- :doc:`feature_secrets` - Managing secrets in specifications +- :doc:`feature_validation` - Specification validation + diff --git a/docs/source/feature_sql_source.rst b/docs/source/feature_sql_source.rst new file mode 100644 index 0000000..4c942a6 --- /dev/null +++ b/docs/source/feature_sql_source.rst @@ -0,0 +1,66 @@ +SQL Source +========== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Data Flow Spec` + * - **Databricks Docs:** + - NA + +Overview +-------- +You can specify a SQL query as a source type in your Data Flow Specs. These allow for flexibility and more complex transformations to be +supported, as needed, without overly complicating the Framework. + +Sample Bundle +------------- + +A sample is available in the ``gold_sample`` bundle in the ``src/dataflows/stream_static_samples`` folder and can be seen in the +``dim_customer_sql_main.json`` file. + +Configuration +------------- + +**SQL Query Definition** + +To define a SQL query, you need to create a ``dml`` folder under the base folder for your given Data Flow Spec. +You can then create a ``.sql`` file for your query under this folder. + +For example: + + :: + + my_pipeline_bundle/ + ├── src/ + │ ├── dataflows + │ │ ├── use_case_1 + │ │ │ ├── my_data_flow_spec_main.json + │ │ │ ├── dml + │ │ │ │ └── my_query.sql + │ │ │ ├── expectations + │ │ │ ├── python_functions + │ │ │ └── schemas + + +Your file can contain any SQL supported by Databricks but must ultimately return a dataset as a Single query. +You can use CTEs, subqueries, joins, etc. 
+ +**Substitution Variables** + +You can use substitution variables in your SQL query by using the ``{var}`` syntax. +These will be substituted per the :doc:`feature_substitutions` documentation. + +For example: + +.. code-block:: sql + + SELECT * FROM {bronze_schema}.my_table + +**Referencing the SQL Source in a Data Flow Spec** + +To reference the SQL source in a Data Flow Spec, you need to specify a SQL source type in your Data Flow Spec. +Refer to the :doc:`dataflow_spec_ref_source_details` section of the :doc:`dataflow_spec_reference` documentation for more information. diff --git a/docs/source/feature_substitutions.rst b/docs/source/feature_substitutions.rst new file mode 100644 index 0000000..33e45a1 --- /dev/null +++ b/docs/source/feature_substitutions.rst @@ -0,0 +1,332 @@ +Substitutions +============= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Framework Bundle` :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Global` :bdg-success:`Pipeline` + * - **Databricks Docs:** + - NA + +When deploying pipeline bundles to different environments (dev, sit, prod), there will normally be a need to cater for differences in resource names (e.g. schema names, storage accounts, URLs) across environments. The substitutions feature caters for this by allowing you to substitute values in your Data Flow Spec and SQL scripts, with values defined in a configuration file. + +Substitutions can be configured in two ways: + +* **tokenized** + Tokens can be included in your Data Flow Specs or SQL statements, indicated by curly braces and a substitution value can be assigned to them in the substitution config file. + Note, tokens can be applied recursively. +* **prefix/suffix** + Prefixes and suffixes can be assigned to dataflow_spec attributes. This will automatically add the prefix or suffix to the value of the attribute in every spot where that attribute is present in a Data Flow Spec even if it is nested.
+ +There are a few reserved tokens that exist by default. Below is a list of the reserved tokens. + + * *workspace_env*: The target workspace environment, this is the one that appears in the ``databricks.yml`` file + +.. important:: + + Ensure that commonly used substitutions are stored in the Global Framework configuration rather than individual Pipeline Bundles. + + For example, maintain schema names in global substitution files. + +Configuration +------------- + +| **Scope: Global** +| In the Framework bundle, substitutions are defined in the following configuration file: ``src/config/<env>_substitutions.json|yaml`` +| e.g. ``src/config/dev_substitutions.json|yaml`` + +| **Scope: Pipeline** +| In a Pipeline bundle, substitutions are defined in the following configuration file: ``src/pipeline_configs/<env>_substitutions.json|yaml`` +| e.g. ``src/pipeline_configs/dev_substitutions.json|yaml`` + +.. note:: + + The ``<env>`` portion of the substitutions config file name must be the same as one of the environment targets listed in the ``databricks.yml`` file, as this determines which environment the bundle will be deployed to. + +.. admonition:: Precedence + :class: note + + The Global substitutions and Pipeline substitutions are merged, with Pipeline substitutions taking precedence. + +Configuration Schema +-------------------- + +The structure of the substitutions config file should be as below: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "tokens": { + "<token_name>": "<substitution_value>", + ... + }, + "prefix_suffix": { + "<attribute_name>": { + "prefix | suffix": "<value>" + }, + ... + } + } + + .. tab:: YAML + + .. code-block:: yaml + + tokens: + <token_name>: <substitution_value> + ... + prefix_suffix: + <attribute_name>: + prefix | suffix: <value> + ... + +.. list-table:: + :header-rows: 1 + + * - Field + - Description + * - **tokens** + - key-value pairs for tokenized substitutions. + * - **prefix_suffix** + - Object containing additional objects defining the substitution behavior for the given attributes.
+ + * attribute_name: the Data Flow Spec attribute you wish to apply the prefix or suffix to. + * prefix | suffix: the substitution mode. + * value: the value to be added as a prefix or suffix. NOTE: + * the value can be a token. + * workspace_env is a reserved token that can be used to pass through the workspace environment (from the ``databricks.yml`` file). + +Examples +-------- +Below is a sample output of substitutions applied for a given substitutions file and Data Flow Spec. + +Substitutions config: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "tokens": { + "bronze_schema_x": "bronze_marketing", + "bronze_schema_y": "bronze_collections" + }, + "prefix_suffix": { + "database": { + "suffix": "{workspace_env}" + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + + tokens: + bronze_schema_x: bronze_marketing + bronze_schema_y: bronze_collections + prefix_suffix: + database: + suffix: '{workspace_env}' + +Data Flow Spec input: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + ... + "flows": { + "f_contract": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_3", + "sourceView": "v_brz_contract" + }, + "views": { + "v_brz_contract": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.{bronze_schema_x}", + "table": "contract", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_loan": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_3", + "sourceView": "v_brz_loan" + }, + "views": { + "v_brz_loan": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.{bronze_schema_y}", + "table": "loan", + "cdfEnabled": true, + } + } + } + }, + ... + } + ... + } + + .. tab:: YAML + + .. code-block:: yaml + + ... 
+ flows: + f_contract: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_3 + sourceView: v_brz_contract + views: + v_brz_contract: + mode: stream + sourceType: delta + sourceDetails: + database: main.{bronze_schema_x} + table: contract + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_loan: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_3 + sourceView: v_brz_loan + views: + v_brz_loan: + mode: stream + sourceType: delta + sourceDetails: + database: main.{bronze_schema_y} + table: loan + cdfEnabled: true + ... + ... + +Data Flow Spec output: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + ... + "flows": { + "f_contract": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_3", + "sourceView": "v_brz_contract" + }, + "views": { + "v_brz_contract": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_marketing_dev", + "table": "contract", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_loan": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_3", + "sourceView": "v_brz_loan" + }, + "views": { + "v_brz_loan": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_collections_dev", + "table": "loan", + "cdfEnabled": true, + } + } + } + }, + ... + } + ... + } + + .. tab:: YAML + + .. code-block:: yaml + + ... 
+ flows: + f_contract: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_3 + sourceView: v_brz_contract + views: + v_brz_contract: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_marketing_dev + table: contract + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_loan: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_3 + sourceView: v_brz_loan + views: + v_brz_loan: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_collections_dev + table: loan + cdfEnabled: true + ... + ... + + +You will notice the database fields all have the workspace environment suffix added to them. + +The tokenized substitution takes place first; then a suffix of ``dev`` is added to all fields named ``database`` anywhere within the Data Flow Spec. \ No newline at end of file diff --git a/docs/source/feature_table_migration.rst b/docs/source/feature_table_migration.rst new file mode 100644 index 0000000..3db50b8 --- /dev/null +++ b/docs/source/feature_table_migration.rst @@ -0,0 +1,223 @@ +Table Migration +=============== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - NA + +The table migration feature allows you to migrate data from existing Delta or SDP tables into a new Spark Declarative Pipeline. It supports both HMS and UC catalogs. + +General Concepts +---------------- + +There are two options for table migration; these are explained in detail below: + +1. **Migration with auto starting version management** + + This is the default mode (``autoStartingVersionsEnabled`` defaults to ``true``). In this mode, the table migration will copy the data from the source table to the target table and will also manage the starting version of the target table. + +2. 
**Migration without auto starting version management** + + In this mode, the table will be copied into the target table, but any starting versions for the sources of the target table need to be explicitly set in the dataflow specification, within the reader options of the corresponding views. + +Configuration +------------- + +Set as an attribute when creating your Data Flow Spec, refer to the :doc:`dataflow_spec_ref_table_migration` section of the :doc:`dataflow_spec_reference` documentation for more information. + +**Key Configuration Options:** + +* ``enabled``: Boolean flag to enable/disable table migration +* ``catalogType``: Type of catalog (\"hms\" or \"uc\") +* ``autoStartingVersionsEnabled``: Boolean flag to enable automatic starting version management (defaults to ``true``) +* ``sourceDetails``: Configuration for the source table to migrate from + +**Required Global Configuration** + +When table migration is enabled, you must specify the volume path for checkpoint state storage in your ``global.json|yaml`` configuration file at either the framework level (``src/config/global.json|yaml``) or pipeline bundle level (``src/pipeline_configs/global.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "table_migration_state_volume_path": "/Volumes/{catalog}/{schema}/{volume}/checkpoint_state" + } + + .. tab:: YAML + + .. code-block:: yaml + + table_migration_state_volume_path: /Volumes/{catalog}/{schema}/{volume}/checkpoint_state + +**Parameter Details:** + +* ``table_migration_state_volume_path``: The full path to a volume where checkpoint state files will be stored. This path must: + + - Point to a valid Databricks Unity Catalog volume + - Be accessible by the pipeline with read/write permissions + - Have sufficient storage capacity for checkpoint state files + - Follow the pattern: ``/Volumes/{catalog}/{schema}/{volume}/{path}`` + +**Example Volume Configuration:** + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "table_migration_state_volume_path": "/Volumes/main/lakeflow_samples_staging/stg_volume/checkpoint_state" + } + + .. tab:: YAML + + .. code-block:: yaml + + table_migration_state_volume_path: /Volumes/main/lakeflow_samples_staging/stg_volume/checkpoint_state + +.. note:: + + The volume path is required because table migration uses persistent file storage to maintain checkpoint state across pipeline runs. Without this configuration, table migration will fail during initialization. + +Migration with Auto Starting Versions +-------------------------------------- + +When migrating a table with auto starting version management, you need to migrate in a specific way: + +1. Ensure the ``table_migration_state_volume_path`` is configured in your ``global.json|yaml`` file and the volume is accessible. +2. Create your pipeline bundle and dataflow specs with the table migration options appropriately set. +3. Deploy your pipeline bundle but ensure that you do not start the pipeline. +4. Allow any jobs / pipelines currently populating your target table and its sources to complete, and then pause the jobs / pipelines populating both the target table and its sources. +5. Execute your new pipeline manually and, once complete, ensure: + + * The pipeline executed successfully. + * The row count of the target table matches the row count of the table you migrated from. + * The checkpoint state files have been created in your volume storage, and they contain records for each source table including the starting versions. + +6. Enable your pipeline in its intended trigger mode. + +How it Works +~~~~~~~~~~~~ + +The table migration feature operates through a state management system that ensures data consistency and proper version tracking during the migration process. Here's how it works: + +**1. 
Migration Manager Initialization** + +When a dataflow specification includes table migration details, the framework initializes a ``TableMigrationManager`` that: + +* Validates the target format is Delta (table migration is only supported for Delta targets) +* Stores migration configuration including source table details and catalog type (HMS or UC) +* Prepares the migration state tracking system + +**2. State Tracking and Version Management** + +For migrations with auto starting version management enabled, the system: + +* Creates or reads from checkpoint state files stored in volumes, partitioned by pipeline ID and table name +* Uses separate storage for initial versions (baseline) and tracking (current state) +* Tracks the current version of each source Delta table using ``DESCRIBE HISTORY`` commands +* Maintains a "ready" flag for each source indicating whether it's safe to start reading from the calculated starting version +* Stores the migration baseline version for each source table in CSV format + +The checkpoint state storage is split into two parts, the initial versions and the tracking. The initial versions contain the baseline version for each source table at the time of migration, and the tracking contains the full state tracking information as described below: + +* ``pipelineId``: The unique identifier of the pipeline +* ``targetTable``: The fully qualified name of the target table +* ``tableName``: The fully qualified name of the source table +* ``viewName``: The name of the view in the pipeline +* ``version``: The baseline version captured during migration +* ``currentVersion``: The latest version of the source table +* ``ready``: Boolean flag indicating if current version > baseline version + +The state tracking store also serves as an audit trail for the migration process, and can be used to verify the migration process and the starting versions applied to the views. + +**3. 
One-Time Migration Flow** + +During pipeline execution, a special import flow is created that: + +* Reads from the table to be migrated +* Handles both simple append scenarios and CDC Type 2 scenarios with proper delete record handling +* For SCD Type 2 migrations, creates additional views to properly handle delete markers and end-dating logic +* Executes as a "run once" flow to perform the initial data copy + +**4. Starting Version Configuration** + +After the migration completes, for ongoing pipeline execution: + +* **Ready Sources**: Views are configured with ``startingVersion`` set to (baseline_version + 1) to continue reading from where the migration left off +* **Not Ready Sources**: Views are configured with a ``WHERE 1=0`` clause to prevent any data reading until the source table advances past the migration baseline + +**5. Integration with Pipeline Flows** + +The migration system integrates seamlessly with the standard pipeline execution: + +* The migration flow executes once only +* Starting versions are applied directly to the dataflow specification during initialization +* The migration state is persisted in volume-based CSV files with proper partitioning +* The checkpoint state files are automatically maintained and updated as source tables advance + +**6. Post Migration** + +Migration is complete once: + +* The table to be migrated has been successfully copied to the target table +* All sources are "ready" and allowed to flow into the target table again. This can be confirmed by checking the checkpoint state files in the volume storage. + +If migration is complete, you can either completely remove the ``tableMigrationDetails`` from the dataflow specification, or you can set its enabled flag to false. Once the dataflow spec has been updated and redeployed, all migration artifacts (views, checkpoint state files, and run once flow) will be removed and you will be left with a clean pipeline. 
+ +**Technical Implementation Details** + +* **Volume-Based State Storage**: Migration state is stored in CSV files within volumes, providing durability and scalability +* **Partitioned Storage**: Files are partitioned by ``pipelineId`` and ``tableName`` for efficient access and organization +* **Dual Storage Paths**: Separate paths for initial versions (baseline capture) and tracking (ongoing state management) +* **Schema Enforcement**: Explicit schema definitions ensure data consistency across storage operations +* **Direct Specification Modification**: Starting versions are applied directly to the dataflow specification rather than during runtime +* **Automatic State Management**: The system automatically handles reading, writing, and updating of checkpoint state files + +**Storage Structure** + +The checkpoint state is stored in the following structure using the configured ``table_migration_state_volume_path``: + +.. code-block:: + + {table_migration_state_volume_path}/ + ├── initial_versions/ + │ └── pipelineId={pipeline_id}/targetTable={target_table}/ + │ └── part-*.csv # Initial baseline versions + └── tracking/ + └── pipelineId={pipeline_id}/targetTable={target_table}/ + └── part-*.csv # Current tracking state + +For example, with the configuration ``"table_migration_state_volume_path": "/Volumes/main/staging/volume/checkpoint_state"``, the actual file paths would be: + +.. code-block:: + + /Volumes/main/staging/volume/checkpoint_state/ + ├── initial_versions/ + │ └── pipelineId=my_pipeline/targetTable=my_target_table/ + │ └── part-00000-*.csv + └── tracking/ + └── pipelineId=my_pipeline/targetTable=my_target_table/ + └── part-00000-*.csv + +This approach ensures that no data is lost or duplicated during the migration process while maintaining full lineage and audit capabilities. + + +Migration without Auto Starting Versions +----------------------------------------- + +The process will vary here depending on your migration scenario. 
It is important that: + +* The starting versions are specified in the dataflow specification, prior to publishing your pipeline bundle and executing the pipeline. +* The jobs populating the target table and its sources are paused for the duration of the migration. +* You ensure that you get the correct source versions matching the version of the target table, at the time of migration. diff --git a/docs/source/feature_target_types.rst b/docs/source/feature_target_types.rst new file mode 100644 index 0000000..4651c19 --- /dev/null +++ b/docs/source/feature_target_types.rst @@ -0,0 +1,67 @@ +Supported Target Types +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Data Flow Spec` + * - **Databricks Docs:** + - https://docs.databricks.com/en/delta-live-tables/python-ref.html + +The Lakeflow Framework supports multiple target types. + +Target Types +------------ +.. list-table:: + :header-rows: 1 + :widths: 20 100 100 + + * - **Type** + - **Description** + - **Key Features** + * - **Delta Streaming Table** + - Creates a streaming Delta table that continuously processes and updates data as it arrives. + - - Streaming write optimizations + - Automatic schema evolution + - Quality enforcement + - Incremental processing + * - **Materialized Views** + - A materialized view is a view that contains precomputed records based on the query that defines the materialized view. Materialized views are commonly used for transformations and Gold Layer tables. + - - Automatic updates based on pipeline schedule/triggers + - Guaranteed consistency with source data + - Incremental refresh optimization + - Ideal for transformations and aggregations + - Pre-computation of slow queries + - Optimization for frequently used computations + - Configuration details: :doc:`feature_materialized_views` + * - **Delta Sink** + - Stream records to a Delta table. 
+ - - Product documentation: https://docs.databricks.com/en/dlt/dlt-sinks + - Limitations: https://docs.databricks.com/en/dlt/dlt-sinks#limitations + * - **Kafka Sink** + - Stream records to a Kafka topic. + - - Product documentation: https://docs.databricks.com/en/dlt/dlt-sinks + - Limitations: https://docs.databricks.com/en/dlt/dlt-sinks#limitations + * - **Foreach Batch Sink** + - Enables processing each micro-batch with custom logic similar to the Spark Structured Streaming ``foreachBatch`` functionality. With the ForEachBatch sink, you can transform, merge, or write streaming data to one or more targets that do not natively support streaming writes. + - - Product documentation: to be released (Private Preview) + - Limitations: Similar to https://docs.databricks.com/en/dlt/dlt-sinks#limitations + +General Data Flow Spec Configuration +------------------------------------ + +Set as an attribute when creating your Data Flow Spec; refer to the :doc:`dataflow_spec_reference` documentation for more information: + +* :doc:`dataflow_spec_ref_source_details` +* :doc:`dataflow_spec_ref_target_details` + +Detailed Target Type Configuration Details +------------------------------------------ + +.. toctree:: + :maxdepth: 1 + + feature_materialized_views diff --git a/docs/source/feature_templates.rst b/docs/source/feature_templates.rst new file mode 100644 index 0000000..db93b31 --- /dev/null +++ b/docs/source/feature_templates.rst @@ -0,0 +1,470 @@ +Templates +========= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-info:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-info:`Pipeline` + +Overview +-------- +The Dataflow Spec Templates feature allows data engineers to create reusable templates for dataflow specifications. This significantly reduces code duplication when multiple dataflows share similar structures but differ only in specific parameters (e.g., table names, columns, etc.). + +.. 
important:: + + Templates provide a powerful mechanism for standardizing dataflow patterns across your organization while maintaining flexibility for specific implementations. + +This feature allows development teams to: + +- **Reduce Code Duplication**: Write once, reuse many times +- **Ensure Consistency**: Similar dataflows follow the same structure +- **Improve Productivity**: Quickly create multiple similar specifications +- **Reduce Errors**: Less copy-paste reduces human error +- **Make Patterns Explicit**: Templates make organizational patterns discoverable + +.. note:: + + Template processing happens during the initialization phase of pipeline execution as the dataflow specs are loaded. Each processed spec is validated using the standard validation process. + +How It Works +------------ + +The template system consists of three main components: + +1. **Template Definitions**: JSON files containing template definitions with placeholders +2. **Template Dataflow Specifications**: A dataflow specification that references a template and provides parameter sets +3. **Template Processing**: Framework logic that processes the template dataflow specifications and generates one dataflow spec per parameter set. + + +Anatomy of a Template Definition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A template definition is a JSON file that defines a reusable dataflow pattern. It consists of three main components: + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "name": "standard_cdc_template", + "parameters": { + "dataFlowId": { + "type": "string", + "required": true + }, + "sourceDatabase": { + "type": "string", + "required": true + }, + "sourceTable": { + "type": "string", + "required": true + }, + "targetTable": { + "type": "string", + "required": true + } + }, + "template": { + "dataFlowId": "${param.dataFlowId}", + "sourceDetails": { + "database": "${param.sourceDatabase}", + "table": "${param.sourceTable}" + }, + "targetDetails": { + "table": "${param.targetTable}" + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + + name: standard_cdc_template + parameters: + dataFlowId: + type: string + required: true + sourceDatabase: + type: string + required: true + sourceTable: + type: string + required: true + targetTable: + type: string + required: true + template: + dataFlowId: ${param.dataFlowId} + sourceDetails: + database: ${param.sourceDatabase} + table: ${param.sourceTable} + targetDetails: + table: ${param.targetTable} + +**Key Components:** + +.. list-table:: + :header-rows: 1 + :widths: 20 80 + + * - Component + - Description + * - **name** + - The unique name for the template. Make this the same as the filename. This is currently a placeholder for future functionality. + * - **parameters** + - An object defining all parameters that can be used in the template. Each parameter has a ``type`` (string, list, object, integer, boolean) and ``required`` flag (defaults to true). Optional ``default`` values can be specified. + * - **template** + - The dataflow specification template containing placeholders in the format ``${param.<name>}``, where ``<name>`` is the name of a parameter defined in the ``parameters`` object. This can be any valid dataflow specification structure with parameters substituted at processing time. + +.. important:: + + - Placeholders can be used in keys, as complete values, or as part of a string value. 
+ - In JSON specs, placeholders must always be wrapped in quotes: ``"${param.name}"`` + +**File Location:** +- Template definitions: ``src/templates/<template_name>.json|yaml`` + + +Anatomy of a Template Dataflow Specification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A template dataflow specification is a simplified file that references a template and provides parameter sets for instantiation. Instead of writing full dataflow specs, data engineers create a template reference: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "template": "standard_cdc_template", + "parameterSets": [ + { + "dataFlowId": "customer_scd2", + "sourceDatabase": "{bronze_schema}", + "sourceTable": "customer_raw", + "targetTable": "customer_scd2" + }, + { + "dataFlowId": "customer_address_scd2", + "sourceDatabase": "{bronze_schema}", + "sourceTable": "customer_address_raw", + "targetTable": "customer_address_scd2" + } + ] + } + + .. tab:: YAML + + .. code-block:: yaml + + template: standard_cdc_template + parameterSets: + - dataFlowId: customer_scd2 + sourceDatabase: '{bronze_schema}' + sourceTable: customer_raw + targetTable: customer_scd2 + - dataFlowId: customer_address_scd2 + sourceDatabase: '{bronze_schema}' + sourceTable: customer_address_raw + targetTable: customer_address_scd2 + +**Key Components:** + +.. list-table:: + :header-rows: 1 + :widths: 25 75 + + * - Component + - Description + * - **template** + - The filename of the template definition to use (without the ``.json`` extension). The framework will search for this template in the configured template directories. + * - **parameterSets** + - An array of parameter sets. Each object in the array represents one set of parameter values that will generate one complete dataflow specification. Each parameter set must include all required parameters defined in the template definition. + +.. 
important:: + + - Each parameter set must include a unique ``dataFlowId`` value + - The array must contain at least one parameter set + - All required parameters from the template definition must be provided in each parameter set + +**File Location:** + +Template dataflow specifications follow the standard dataflow specification naming convention: ``src/dataflows/<dataflow_folder>/dataflowspec/*_main.json|yaml`` + +**Processing Result:** + +A template dataflow specification with N parameter sets will generate N complete dataflow specifications at runtime, each validated independently. + +Template Processing +^^^^^^^^^^^^^^^^^^^ + +During the dataflow spec build process, the template processor will: + +1. Detect spec files containing a ``template`` key +2. Load the referenced template file +3. For each parameter set in ``parameterSets``, create a concrete spec by replacing all ``${param.<name>}`` placeholders +4. Validate each expanded spec using the existing schema validators +5. Return the expanded specs with unique internal identifiers + +Example Usage +------------- + +Example: Basic File Source Ingestion Template +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This example shows a template for basic file source ingestion, from a hypothetical source system called "erp_system". + +**Template Definition** (``src/templates/bronze_erp_system_file_ingestion_template.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "name": "bronze_erp_system_file_ingestion_template", + "parameters": { + "dataFlowId": { + "type": "string", + "required": true + }, + "sourceTable": { + "type": "string", + "required": true + }, + "schemaPath": { + "type": "string", + "required": true + }, + "targetTable": { + "type": "string", + "required": true + } + }, + "template": { + "dataFlowId": "${param.dataFlowId}", + "dataFlowGroup": "bronze_erp_system", + "dataFlowType": "standard", + "sourceSystem": "erp_system", + "sourceType": "cloudFiles", + "sourceViewName": "v_${param.sourceTable}", + "sourceDetails": { + "path": "{landing_erp_file_location}/${param.sourceTable}/", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "${param.schemaPath}" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "${param.targetTable}" + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + + name: bronze_erp_system_file_ingestion_template + parameters: + dataFlowId: + type: string + required: true + sourceTable: + type: string + required: true + schemaPath: + type: string + required: true + targetTable: + type: string + required: true + template: + dataFlowId: ${param.dataFlowId} + dataFlowGroup: bronze_erp_system + dataFlowType: standard + sourceSystem: erp_system + sourceType: cloudFiles + sourceViewName: v_${param.sourceTable} + sourceDetails: + path: '{landing_erp_file_location}/${param.sourceTable}/' + readerOptions: + cloudFiles.format: csv + header: 'true' + schemaPath: ${param.schemaPath} + mode: stream + targetFormat: delta + targetDetails: + table: ${param.targetTable} + +**Template Dataflow Specification** (``src/dataflows/bronze_erp_system/dataflowspec/bronze_erp_system_file_ingestion_main.json|yaml``): + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "template": "bronze_erp_system_file_ingestion_template", + "parameterSets": [ + { + "dataFlowId": "customer_file_source", + "sourceTable": "customer", + "schemaPath": "customer_schema.json", + "targetTable": "customer" + }, + { + "dataFlowId": "customer_address_file_source", + "sourceTable": "customer_address", + "schemaPath": "customer_address_schema.json", + "targetTable": "customer_address" + }, + { + "dataFlowId": "supplier_file_source", + "sourceTable": "supplier", + "schemaPath": "supplier_schema.json", + "targetTable": "supplier" + } + ] + } + + .. tab:: YAML + + .. code-block:: yaml + + template: bronze_erp_system_file_ingestion_template + parameterSets: + - dataFlowId: customer_file_source + sourceTable: customer + schemaPath: customer_schema.json + targetTable: customer + - dataFlowId: customer_address_file_source + sourceTable: customer_address + schemaPath: customer_address_schema.json + targetTable: customer_address + - dataFlowId: supplier_file_source + sourceTable: supplier + schemaPath: supplier_schema.json + targetTable: supplier + +**Result**: This template dataflow specification generates **3 concrete dataflow specs**, one for each parameter set in the ``parameterSets`` array. + + +Parameter Types +--------------- + +Parameters support multiple data types and structures: + +.. list-table:: + :header-rows: 1 + :widths: 20 30 50 + + * - Type + - Template Usage + - Example + * - **Strings** + - ``"${param.tableName}"`` + - ``"tableName": "customer"`` + * - **Numbers** + - ``${param.batchSize}`` + - ``"batchSize": 1000`` + * - **Booleans** + - ``${param.enabled}`` + - ``"enabled": true`` + * - **Arrays** + - ``${param.keyColumns}`` + - ``"keyColumns": ["ID", "DATE"]`` + * - **Objects** + - ``${param.config}`` + - ``"config": {"key": "value"}`` + +Key Features +------------ + +Python Function Path Search Priority +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Enhanced fallback chain for path values. 
+ +The framework searches for python function path values in the following order: + +1. In the pipeline bundle base path of the dataflow spec file +2. Under the templates directory of the pipeline bundle +3. Under the extensions directory of the pipeline bundle +4. Under the framework extensions directory + +Error Handling +^^^^^^^^^^^^^^ + +The framework provides clear error messages for common issues: + +- **Missing template file**: Lists all searched locations +- **Missing parameters**: Warns about unreplaced placeholders +- **Invalid JSON**: Shows parsing errors with context +- **Validation errors**: Each expanded spec is validated individually + +Validation +^^^^^^^^^^ + +Each expanded spec is validated using the existing schema validators to ensure correctness. + +Template usage specs are validated against the schema at ``src/schemas/spec_template.json``: + +- ``template``: Required string (template name without .json extension) +- ``parameterSets``: Required array with at least one parameter object +- Each parameter object must be a dictionary with at least one key-value pair + +Unique Identifiers +^^^^^^^^^^^^^^^^^^ + +Generated specs receive unique internal keys in the format ``path#template_0``, ``path#template_1``, etc., to ensure proper tracking and debugging. + +Best Practices +-------------- + +Naming Conventions +^^^^^^^^^^^^^^^^^^ + +1. **Template Files**: Use descriptive names ending with ``_template`` (e.g., ``standard_cdc_template.json``) +2. **Parameter Names**: Use clear, descriptive names (e.g., ``sourceTable`` instead of ``st``) +3. **Consistency**: Maintain consistent naming patterns across related templates + +Development and Testing +^^^^^^^^^^^^^^^^^^^^^^^ + +1. **Concrete First**: Develop a concrete dataflow spec first, get it working and then turn it into a template definition. +2. **Validation**: Always test processed specs by running the pipeline with a small subset of data +3. 
**Version Control**: Track templates in version control to maintain a history of changes +4. **Iterative Development**: Start with a simple template and enhance it as patterns emerge + +Maintainability +^^^^^^^^^^^^^^^ + +1. **Template Updates**: When updating a template, test all usages to ensure compatibility +2. **Parameter Validation**: Document required parameters for each template +3. **Backwards Compatibility**: Consider versioning templates if making breaking changes + +Limitations +----------- + +The current template implementation has the following limitations, which may be addressed in future versions: + +1. **No Template Sub Components (Blocks)**: Templates cannot reference other templates or smaller template blocks +2. **No Conditional Logic**: Complex conditional logic is not supported (consider using multiple templates) + +.. note:: + + For complex conditional logic requirements, create multiple templates that represent different scenarios rather than trying to implement logic within a single template. diff --git a/docs/source/feature_ui_integration.rst b/docs/source/feature_ui_integration.rst new file mode 100644 index 0000000..748b88c --- /dev/null +++ b/docs/source/feature_ui_integration.rst @@ -0,0 +1,49 @@ +UI Integration +============================= + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`VS Code Settings` + * - **Databricks Docs:** + - NA + +Overview +-------- +The framework integrates with the Databricks Pipeline Editor UI to provide a seamless experience when creating or editing Data Flow specifications. This allows for live validation and editing of a Data Flow spec while having the ability to debug natively in the Pipeline Editor. 
+ +Configuration to enable Pipeline Editor UI Integration +------------------------------------------------------ +To enable UI Integration, you need to add a ``root_path`` configuration to your Pipeline resource file that points to the root directory of the Data Flow specifications for that pipeline, as shown below. + +.. code-block:: yaml + + resources: + pipelines: + lakeflow_samples_bronze_base_pipeline: + name: Lakeflow Framework - Bronze - Base Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: base_samples + root_path: ${workspace.file_path}/src/dataflows/base_samples/dataflowspec + +Below is an example of the Pipeline Editor UI integration. + +.. image:: images/screenshot_pipeline_editor_ui.png + :alt: Pipeline Editor UI diff --git a/docs/source/feature_validation.rst b/docs/source/feature_validation.rst new file mode 100644 index 0000000..50facac --- /dev/null +++ b/docs/source/feature_validation.rst @@ -0,0 +1,93 @@ +Validation +========== + +.. 
list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Pipeline Bundle` + * - **Databricks Docs:** + - NA + +Overview +-------- +The framework uses the Python ``jsonschema`` library to define the schema and validation rules for the: + +- Data Flow Specifications +- Expectations +- Secrets Configurations + +This provides the following functionality: + +- :doc:`feature_auto_complete` +- Validation in your CI/CD pipelines +- Validation at Spark Declarative Pipeline initialization time + +How Validation Works +-------------------- + +The framework uses the ``jsonschema`` library to validate the Data Flow Specifications, Expectations, and Secrets Configurations. +Essentially, each time a pipeline executes, the following steps are performed: + +.. list-table:: + :widths: 10 30 60 + :header-rows: 1 + + * - Step + - Name + - Description + * - 1 + - Load and Initialize Framework + - Load and initialize the Framework + * - 2 + - Retrieve Data Flow Specifications + - + a. **Retrieve and validate:** + - Read and validate ALL the Data Flow Specifications, Expectations, and Secrets Configurations from the workspace files location of the Pipeline Bundle. + - If a file is not valid, it will be added to an error list. + - If any files failed validation, the pipeline will fail and the user will receive a list of validation errors. + b. **Apply pipeline filters:** + - The framework will apply any pipeline filters to the in-memory dictionary. + - The only exception to this is the File Filter, in which case the framework will read only the specified file(s). + * - 3 + - Generate Pipeline Definition + - The Framework will then use the in-memory dictionary to initialize the Spark Declarative Pipeline. + * - 4 + - Execute Pipeline + - The pipeline will then execute the logic defined in the Data Flow Specifications. 
+ +Ignoring Validation Errors +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ignoring validation errors can be useful when iterating in Dev or SIT environments and you want to focus on specific Data Flow Specs (selected by your pipeline filters), without being blocked by validation errors. + +You can ignore validation errors by setting the ``pipeline.ignoreValidationErrors`` configuration to ``True``. + +You can do this in the pipeline resource YAML file or via the Databricks UI in the Spark Declarative Pipeline Settings. + +.. code-block:: yaml + :emphasize-lines: 21 + + resources: + pipelines: + dlt_framework_samples_bronze_base_pipeline: + name: Lakeflow Framework Samples - Bronze - Base Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: base_samples + pipeline.ignoreValidationErrors: True diff --git a/docs/source/feature_versioning_dataflow_spec.rst b/docs/source/feature_versioning_dataflow_spec.rst new file mode 100644 index 0000000..74096f0 --- /dev/null +++ b/docs/source/feature_versioning_dataflow_spec.rst @@ -0,0 +1,272 @@ +Versioning - DataFlow Specs +=========================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Pipeline Bundle` + * - **Configuration Scope:** + - :bdg-success:`Global` :bdg-success:`Individual DataFlow Specs` + * - **Databricks Docs:** + - NA + +Overview +-------- +The Lakeflow Framework supports dataflow specification versioning to enable backwards compatibility and gradual migration of dataflow specifications. 
+This feature allows different mapping versions to be applied to transform the structure and content of dataflow specifications during processing. This is particularly useful for: + +- Maintaining backwards compatibility when dataflow specification schemas evolve +- Gradually migrating existing dataflow specifications to new formats +- Testing new specification formats without breaking existing workflows +- Restructuring dataflow specifications to accommodate schema changes + +The versioning system applies transformation mappings that can rename fields, move content to different locations, and remove obsolete fields in the dataflow specifications, allowing older specification formats to work with newer framework versions. + + +Mapping File Structure +---------------------- +DataFlow specification mappings are stored in version-specific directories under: +``src/config/dataflow_spec_mapping/[version]/dataflow_spec_mapping.json`` + +Each mapping file contains transformation rules organized by: + +- **global**: Mappings applied to all dataflow specification types +- **[dataflow_type]**: Mappings applied only to specific dataflow types (e.g., "standard", "flow", "materialized_view") + +The mapping file supports three types of transformations: + +1. **Renaming**: Change key names while preserving structure +2. **Moving**: Relocate keys and values to different parts of the specification +3. **Deleting**: Remove obsolete keys and their values + +Mapping Operations +------------------ + +Rename Operations +~~~~~~~~~~~~~~~~~ +Rename operations change key names while preserving the value and structure. Two types of renaming are supported: + +- **rename_all**: Recursively renames keys throughout the entire specification structure +- **rename_specific**: Renames keys at specific nested paths + +.. 
code-block:: json + + { + "global": { + "rename_all": { + "oldKeyName": "newKeyName", + "cdcApplyChanges": "cdcSettings" + }, + "rename_specific": { + "targetDetails.topic": "targetDetails.name" + } + } + } + +Move Operations +~~~~~~~~~~~~~~~ +Move operations relocate keys and their values to different locations within the specification structure using dot notation for nested paths: + +.. code-block:: json + + { + "global": { + "move": { + "legacyConfig": "settings.advanced.legacyConfig", + "sourceMetadata": "targetDetails.metadata", + "targetDetails.topic": "targetDetails.sinkOptions.topic" + } + } + } + +Delete Operations +~~~~~~~~~~~~~~~~~ +Delete operations remove keys and their values from the specification: + +.. code-block:: json + + { + "global": { + "delete": [ + "deprecatedField", + "obsoleteTimestamp" + ] + } + } + +Complete Example +~~~~~~~~~~~~~~~~ +A comprehensive mapping file combining all operation types: + +.. code-block:: json + + { + "global": { + "rename_all": { + "cdcApplyChanges": "cdcSettings", + "cdcApplyChangesFromSnapshot": "cdcSnapshotSettings" + }, + "rename_specific": { + "targetDetails.topic": "targetDetails.name" + }, + "move": { + "legacyConfig": "settings.advanced.legacyConfig", + "targetDetails.topic": "targetDetails.sinkOptions.topic" + }, + "delete": [ + "deprecatedField", + "obsoleteTimestamp" + ] + }, + "flow": { + "rename_all": { + "flowSpecificOldField": "flowSpecificNewField" + }, + "move": { + "flowConfig": "flowGroups.0.configuration" + }, + "delete": [ + "temporaryField" + ] + } + } + +Configure Global DataFlow Version +---------------------------------- +To set a global dataflow specification version that applies to all specifications in a pipeline, configure the ``dataflow_spec_version`` parameter in your pipeline configuration. + +This can be set in your pipeline substitutions file: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataflow_spec_version": "0.1.0" + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataflow_spec_version: 0.1.0 + +When a global version is set, all dataflow specifications in the pipeline will use this mapping version unless overridden at the individual specification level. + + +Configure Individual DataFlow Specification Version +---------------------------------------------------- +Individual dataflow specifications can override the global version by setting the ``dataFlowVersion`` field in their specification file: + +.. tabs:: + + .. tab:: JSON + + .. code-block:: json + + { + "dataFlowId": "customer_data_flow", + "dataFlowGroup": "customers", + "dataFlowType": "standard", + "dataFlowVersion": "0.1.0", + "data": { + "sourceType": "delta", + "targetFormat": "delta", + "targetDetails": { + "table": "customers" + } + } + } + + .. tab:: YAML + + .. code-block:: yaml + + dataFlowId: customer_data_flow + dataFlowGroup: customers + dataFlowType: standard + dataFlowVersion: 0.1.0 + data: + sourceType: delta + targetFormat: delta + targetDetails: + table: customers + +The individual specification version takes precedence over the global version for that specific dataflow. + +.. note:: + If neither global nor individual dataflow version is specified, no mappings will be applied and the specifications will be processed with their original structure. + + +Transformation Order +-------------------- +The framework applies transformations in a specific order to ensure consistent results: + +1. **Move Operations**: Keys are copied to their new locations first +2. **Rename Specific Operations**: Keys are renamed using ``rename_specific`` (targeted) +3. **Rename All Operations**: Keys are renamed using ``rename_all`` (recursive) +4. **Move Cleanup**: Original source keys from move operations are removed +5. **Delete Operations**: Keys specified in delete operations are removed + + +Best Practices +-------------- +1. 
**Default Behavior** + - Only use dataflow versioning when backwards compatibility is required + - New specifications should use the current schema format without versioning + +2. **Version Consistency** + - Use consistent version numbers across related dataflow specifications + +3. **Migration Strategy** + - Start with global version configuration for bulk migrations + - Use individual specification versions for gradual, selective migration + - Remove version specifications once migration is complete + - Plan transformation order carefully when combining multiple operation types + +4. **Path Notation** + - Use dot notation for nested paths (e.g., ``"parentKey.childKey.grandchildKey"``) + - Ensure target paths exist or can be created for move operations + - Be careful with path conflicts when moving and renaming the same keys + +5. **Testing** + - Thoroughly test mapping transformations in development environments + - Validate that transformed specifications maintain expected functionality + - Test all operation types (rename, move, delete) both individually and in combination + - Keep original specifications as backup during migration + + +Version Management +------------------ +1. Mapping versions should follow semantic versioning (MAJOR.MINOR.PATCH) +2. Each mapping version should be stored in its own directory under ``src/config/dataflow_spec_mapping/`` +3. Maintain documentation of what each version transforms and why +4. Keep mapping files immutable once deployed to ensure consistency +5. Create new mapping versions rather than modifying existing ones +6. 
Archive obsolete mapping versions only after confirming no pipelines depend on them + + +Troubleshooting +--------------- +**Common Issues:** + +- **Mapping not applied**: Verify the version string exactly matches the directory name under ``dataflow_spec_mapping/`` +- **Key not transformed**: Check that the key exists in the appropriate section (global or type-specific) of the mapping file +- **Specification validation errors**: Ensure transformed keys match the expected schema after mapping application +- **Move operation fails**: Verify source keys exist and target paths are valid using dot notation +- **Path conflicts**: Check for conflicts between move targets and existing keys +- **Delete operation errors**: Ensure keys to be deleted exist at the specified paths + +**Debugging:** +Enable debug logging to see transformation application details: + +.. code-block:: python + + # Framework logs will show: + # "Global Dataflow Spec Mapping Version: [version]" + # "Retrieved Dataflow Spec Specific Mapping Version: [version]. Dataflow Spec ID: [id]" + # "Mapping applied to spec: [spec_path]" + # "New spec: [spec_data]" \ No newline at end of file diff --git a/docs/source/feature_versioning_framework.rst b/docs/source/feature_versioning_framework.rst new file mode 100644 index 0000000..1c5c0a4 --- /dev/null +++ b/docs/source/feature_versioning_framework.rst @@ -0,0 +1,100 @@ +Versioning - Framework +====================== + +.. list-table:: + :header-rows: 0 + + * - **Applies To:** + - :bdg-success:`Framework Bundle` + * - **Configuration Scope:** + - :bdg-success:`Global` + * - **Databricks Docs:** + - NA + +Overview +-------- +The Lakeflow Framework supports versioning to allow different pipelines to use specific versions of the framework. +This feature enables deploying specific versions of the framework to target environments. 
This is particularly useful for: + +- Rolling back pipelines to use previous versions of the framework where the current version is not suitable +- Testing new framework versions with specific pipelines +- Supporting gradual framework upgrades across different pipelines where some pipelines are not ready to upgrade yet + +In production environments (and CI/CD pipelines in general), the framework should be deployed twice: + +1. First deployment with version set to "current" (default) +2. Second deployment with version set to a specific version number + +This dual deployment strategy ensures that a previous stable version of the framework is always available for rollback purposes. + + +Deploy the framework with a specific version +----------------------------------------------- +The framework's version is configured in the databricks.yaml file, defaulting to "current": + +.. code-block:: yaml + + variables: + version: + description: The framework version to deploy this bundle as + default: current + +To deploy a specific version, override the default using the ``BUNDLE_VAR_version`` environment variable: + +.. code-block:: bash + + export BUNDLE_VAR_version="1.2.3" + +For CI/CD deployments, execute two deployments: + +.. code-block:: bash + + # First deployment - latest version + export BUNDLE_VAR_version="current" + databricks bundle deploy + + # Second deployment - specific version for rollback + export BUNDLE_VAR_version="1.2.3" + databricks bundle deploy + + +Set framework version for pipelines +----------------------------------- +By default, all pipelines (including production) should use the "current" version of the framework. Version locking should only be used in rollback scenarios when issues are discovered. + +To specify which framework version a pipeline should use, set the ``framework_source_path`` variable in the pipeline bundle. 
The path follows this pattern: +``/Workspace/Users/[user]/.bundle/[framework_name]/[target]/[version]/files/src`` + +Set this path using the ``BUNDLE_VAR_framework_source_path`` environment variable during pipeline deployment: + +.. code-block:: bash + + export BUNDLE_VAR_framework_source_path="/Workspace/Users/[user]/.bundle/[framework_name]/[target]/[version]/files/src" + +.. note:: + The ``framework_source_path`` setting applies to all pipelines in the bundle. While individual pipeline versions can be + modified directly in the Databricks workspace, this is not recommended for production environments. + +Best Practices +-------------- +1. Default Version + - Always default to the "current" version for both development and production pipelines + - This ensures you benefit from the latest features and fixes + +2. Version Locking + - Only lock to specific versions during rollback scenarios + - Return to "current" once issues are resolved + +3. Rollback Strategy + - In case of issues, quickly rollback by specifying the previous working version + - Update the framework_source_path in pipeline configuration to point to the previous version + + + +Version Management +------------------ +1. Framework versions should follow semantic versioning (MAJOR.MINOR.PATCH) +2. Each release should be tagged in the source control system +3. The "current" version always points to the latest stable release +4. Previous versions are maintained for rollback purposes +5. Keep a changelog of versions and their changes + diff --git a/docs/source/features.rst b/docs/source/features.rst new file mode 100644 index 0000000..403b9d4 --- /dev/null +++ b/docs/source/features.rst @@ -0,0 +1,37 @@ +Framework Features +================== + +.. 
toctree:: + :maxdepth: 1 + + feature_auto_complete + feature_builder_parallelization + feature_cdc + feature_cdf + feature_data_quality_expectations + feature_data_quality_quarantine + feature_direct_publishing_mode + feature_liquid_clustering + feature_logging + feature_logical_environment + feature_materialized_views + feature_mandatory_table_properties + feature_multi_source_streaming + feature_operational_metadata + feature_python_dependency_management + feature_python_extensions + feature_python_functions + feature_schemas + feature_spec_format + feature_secrets + feature_soft_deletes + Source Types + feature_spark_configuration + feature_substitutions + feature_table_migration + Target Types + feature_templates + feature_validation + feature_versioning_dataflow_spec + feature_versioning_framework + feature_ui_integration diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst new file mode 100644 index 0000000..f03ef1b --- /dev/null +++ b/docs/source/getting_started.rst @@ -0,0 +1,32 @@ +Getting Started +=============== + +This section is a quick start guide on how to get started with the Lakeflow Framework as a data engineer. + +Pre-Requisites +-------------- + +1. Databricks CLI installed and configured, if you are using DABs to locally deploy the Lakeflow Framework and Pipeline Bundles. +2. Access to a Databricks workspace. +3. VSCode installed. + +Setup +----- + +Follow the steps below to get yourself set up to learn and use the Lakeflow Framework: + +1. :doc:`deploy_framework` +2. :doc:`deploy_samples` +3. :doc:`feature_auto_complete` + +Understanding the Framework +--------------------------- +1. :doc:`concepts` +2. Step through and execute one of the basic samples and inspect the create_dataflow_spec +3. :doc:`features` + +Developing your first Pipeline Bundle +------------------------------------- + +1. Select the recommended pipeline pattern that best fits your use case, as documented in :doc:`patterns` +2. 
Build and deploy a data pipeline bundle. Refer to :doc:`build_pipeline_bundle`. \ No newline at end of file diff --git a/docs/source/how_to_write_unit_tests.rst b/docs/source/how_to_write_unit_tests.rst new file mode 100644 index 0000000..6a4fc68 --- /dev/null +++ b/docs/source/how_to_write_unit_tests.rst @@ -0,0 +1,194 @@ +How to Create Unit Test Functions for a Project Using `pytest` and `unittest` +============================================================================= + +Prerequisites +------------- +1. **Install `pytest` and `unittest` (if not already installed):** + + - Install `pytest`:: + + pip install pytest + + - `unittest` is included in Python’s standard library, so no additional installation is required. + +2. **Project Setup:** + + Organise your project files in a logical structure: +:: + + my_project/ + ├── src/ + │ ├── module.py # Your source code + ├── tests/ + │ ├── test_module.py # Your test files + ├── requirements.txt + └── README.md + +Writing Unit Test Functions with `pytest` +----------------------------------------- + +1. **Test File Naming:** + + Name test files starting with `test_` or ending with `_test.py`, e.g., `test_module.py`. + +2. **Test Function Naming:** + + Name test functions starting with `test_`, e.g., `test_add_function`. + +3. **Basic Structure:** + + .. code-block:: python + + import pytest + from src.module import add + + def test_add_function(): + # Arrange + a, b = 2, 3 + + # Act + result = add(a, b) + + # Assert + assert result == 5 + +4. **Using Fixtures:** + + Use `@pytest.fixture` to set up reusable test data or objects. + + .. code-block:: python + + @pytest.fixture + def sample_data(): + return {"a": 2, "b": 3} + + def test_add_function_with_fixture(sample_data): + result = add(sample_data["a"], sample_data["b"]) + assert result == 5 + +5. 
**Running Tests:** + + - Run all tests:: + + pytest + + - Run a specific test file:: + + pytest tests/test_module.py + + +Writing Unit Test Functions with `unittest` +------------------------------------------- + +1. **Test File Naming:** + + The naming convention is not strict, but `test_*.py` is commonly used. + +2. **Basic Structure:** + + Inherit from `unittest.TestCase` to define test cases. + + .. code-block:: python + + import unittest + from src.module import add + + class TestAddFunction(unittest.TestCase): + def test_add(self): + # Arrange + a, b = 2, 3 + + # Act + result = add(a, b) + + # Assert + self.assertEqual(result, 5) + + if __name__ == "__main__": + unittest.main() + +3. **Using `setUp` and `tearDown`:** + + Define `setUp` and `tearDown` methods for reusable setup and cleanup logic. + + .. code-block:: python + + class TestAddFunction(unittest.TestCase): + def setUp(self): + self.a = 2 + self.b = 3 + + def tearDown(self): + # Clean up resources, if any + pass + + def test_add(self): + result = add(self.a, self.b) + self.assertEqual(result, 5) + +4. **Running Tests:** + + - Discover and run all tests:: + + python -m unittest discover + + - Run a specific test file:: + + python -m unittest tests/test_module.py + + +General Tips for Writing Unit Tests +----------------------------------- + +1. **Follow the AAA Pattern:** + + - **Arrange:** Set up test data and preconditions. + - **Act:** Execute the function or method under test. + - **Assert:** Verify the result matches expectations. + +2. **Use Mocks and Stubs:** + + Use `unittest.mock` or `pytest-mock` to replace external dependencies or isolate units. + +3. **Group Related Tests:** + + Use classes or organise tests logically to improve readability. + +4. **Test Edge Cases:** + + Test normal, boundary, and error conditions. + +Example: Pytest vs. Unittest +---------------------------- + +**Using `pytest`:** + +.. 
code-block:: python + + import pytest + from src.module import divide + + def test_divide_by_nonzero(): + assert divide(10, 2) == 5 + + def test_divide_by_zero(): + with pytest.raises(ZeroDivisionError): + divide(10, 0) + +**Using `unittest`:** + +.. code-block:: python + + import unittest + from src.module import divide + + class TestDivideFunction(unittest.TestCase): + def test_divide_by_nonzero(self): + self.assertEqual(divide(10, 2), 5) + + def test_divide_by_zero(self): + with self.assertRaises(ZeroDivisionError): + divide(10, 0) + +Both frameworks achieve the same goals but differ in syntax and flexibility. Choose based on project requirements and personal preference. + diff --git a/docs/source/images/basic_1_1.png b/docs/source/images/basic_1_1.png new file mode 100644 index 0000000..de87f61 Binary files /dev/null and b/docs/source/images/basic_1_1.png differ diff --git a/docs/source/images/cdc_stream_from_snapshot.png b/docs/source/images/cdc_stream_from_snapshot.png new file mode 100644 index 0000000..f74b23f Binary files /dev/null and b/docs/source/images/cdc_stream_from_snapshot.png differ diff --git a/docs/source/images/framework_concept_overview.png b/docs/source/images/framework_concept_overview.png new file mode 100644 index 0000000..76e851e Binary files /dev/null and b/docs/source/images/framework_concept_overview.png differ diff --git a/docs/source/images/mix_and_match.png b/docs/source/images/mix_and_match.png new file mode 100644 index 0000000..6c3a973 Binary files /dev/null and b/docs/source/images/mix_and_match.png differ diff --git a/docs/source/images/pipeline_concepts_overview.png b/docs/source/images/pipeline_concepts_overview.png new file mode 100644 index 0000000..f186cc5 Binary files /dev/null and b/docs/source/images/pipeline_concepts_overview.png differ diff --git a/docs/source/images/screenshot_intellisense_keys.png b/docs/source/images/screenshot_intellisense_keys.png new file mode 100644 index 0000000..5d3d562 Binary files /dev/null 
and b/docs/source/images/screenshot_intellisense_keys.png differ diff --git a/docs/source/images/screenshot_intellisense_values.png b/docs/source/images/screenshot_intellisense_values.png new file mode 100644 index 0000000..6057e87 Binary files /dev/null and b/docs/source/images/screenshot_intellisense_values.png differ diff --git a/docs/source/images/screenshot_logs_viewing_1.png b/docs/source/images/screenshot_logs_viewing_1.png new file mode 100644 index 0000000..164c792 Binary files /dev/null and b/docs/source/images/screenshot_logs_viewing_1.png differ diff --git a/docs/source/images/screenshot_logs_viewing_2.png b/docs/source/images/screenshot_logs_viewing_2.png new file mode 100644 index 0000000..268ce6b Binary files /dev/null and b/docs/source/images/screenshot_logs_viewing_2.png differ diff --git a/docs/source/images/screenshot_pipeline_editor_ui.png b/docs/source/images/screenshot_pipeline_editor_ui.png new file mode 100644 index 0000000..4fd17d1 Binary files /dev/null and b/docs/source/images/screenshot_pipeline_editor_ui.png differ diff --git a/docs/source/images/screenshot_pipeline_log_level_ui.png b/docs/source/images/screenshot_pipeline_log_level_ui.png new file mode 100644 index 0000000..7aa5c8a Binary files /dev/null and b/docs/source/images/screenshot_pipeline_log_level_ui.png differ diff --git a/docs/source/images/screenshot_pipeline_log_level_yaml.png b/docs/source/images/screenshot_pipeline_log_level_yaml.png new file mode 100644 index 0000000..2a5bb96 Binary files /dev/null and b/docs/source/images/screenshot_pipeline_log_level_yaml.png differ diff --git a/docs/source/images/screenshot_validation_keys.png b/docs/source/images/screenshot_validation_keys.png new file mode 100644 index 0000000..1e399e3 Binary files /dev/null and b/docs/source/images/screenshot_validation_keys.png differ diff --git a/docs/source/images/screenshot_validation_values.png b/docs/source/images/screenshot_validation_values.png new file mode 100644 index 0000000..294de7f 
Binary files /dev/null and b/docs/source/images/screenshot_validation_values.png differ diff --git a/docs/source/images/screenshot_vscode_json_settings.png b/docs/source/images/screenshot_vscode_json_settings.png new file mode 100644 index 0000000..7305211 Binary files /dev/null and b/docs/source/images/screenshot_vscode_json_settings.png differ diff --git a/docs/source/images/stream_multi_base.png b/docs/source/images/stream_multi_base.png new file mode 100644 index 0000000..69942ee Binary files /dev/null and b/docs/source/images/stream_multi_base.png differ diff --git a/docs/source/images/stream_multi_granular.png b/docs/source/images/stream_multi_granular.png new file mode 100644 index 0000000..06cd165 Binary files /dev/null and b/docs/source/images/stream_multi_granular.png differ diff --git a/docs/source/images/stream_multi_monolithic.png b/docs/source/images/stream_multi_monolithic.png new file mode 100644 index 0000000..4925af5 Binary files /dev/null and b/docs/source/images/stream_multi_monolithic.png differ diff --git a/docs/source/images/stream_static_basic.png b/docs/source/images/stream_static_basic.png new file mode 100644 index 0000000..4773275 Binary files /dev/null and b/docs/source/images/stream_static_basic.png differ diff --git a/docs/source/images/stream_static_dwh.png b/docs/source/images/stream_static_dwh.png new file mode 100644 index 0000000..aa46870 Binary files /dev/null and b/docs/source/images/stream_static_dwh.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..8217b8c --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,28 @@ +.. Lakeflow Framework documentation master file, created by + sphinx-quickstart on Mon Nov 25 17:16:32 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Lakeflow Framework documentation +================================ + +Add your content using ``reStructuredText`` syntax. 
See the +`reStructuredText <https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html>`_ +documentation for details. + + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + Introduction <introduction> + getting_started + Concepts <concepts> + Features <features> + deploy_framework + deploy_samples + build_pipeline_bundle + dataflow_spec_reference + orchestration + contributor + logical_environment diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst new file mode 100644 index 0000000..0e1c17a --- /dev/null +++ b/docs/source/introduction.rst @@ -0,0 +1,37 @@ +Introduction to the Lakeflow Framework +###################################### + +The Lakeflow Framework is a metadata-driven data engineering framework designed to: + +* accelerate and simplify the deployment of Spark Declarative Pipelines (SDP), and support their deployment through your SDLC. +* support a wide variety of patterns across the medallion architecture for both batch and streaming workloads. +* provide a structured, configuration-driven approach to building reliable and maintainable data pipelines. + +The Framework is designed for simplicity, performance, ease of maintenance and extensibility as the SDP product evolves. + +Core Concepts +------------- + +* **Lego block, pattern-based development** +* **Two Parts** + + * SDP wrapper components: close to the metal, exposing SDP APIs directly to minimise the need for changes. + * Dataflow Spec abstraction layer: allows users to put the SDP components together, as needed, like Lego blocks. + +* **Key Design** + + * DABS native + * No artifacts or wheel files + * Minimized third-party dependencies + * No control tables + * Extensible + * Flexible deployment bundles + +* **OO & Best Practices** + + * Encapsulation + * Abstraction & Inheritance + * Loosely Coupled + * Separation of Concerns & Single Responsibility + +Please refer to the :doc:`concepts` section for an overview of the different components of the framework. 
diff --git a/docs/source/monitoring_and_observability.rst b/docs/source/monitoring_and_observability.rst new file mode 100644 index 0000000..06a94f8 --- /dev/null +++ b/docs/source/monitoring_and_observability.rst @@ -0,0 +1,4 @@ +Monitoring and Observability +============================= + +Under Construction \ No newline at end of file diff --git a/docs/source/operations.rst b/docs/source/operations.rst new file mode 100644 index 0000000..5a5d733 --- /dev/null +++ b/docs/source/operations.rst @@ -0,0 +1,4 @@ +Operations +=========== + +Under Construction \ No newline at end of file diff --git a/docs/source/orchestration.rst b/docs/source/orchestration.rst new file mode 100644 index 0000000..403d8ec --- /dev/null +++ b/docs/source/orchestration.rst @@ -0,0 +1,4 @@ +Orchestration +============= + +Under Construction \ No newline at end of file diff --git a/docs/source/patterns.rst b/docs/source/patterns.rst new file mode 100644 index 0000000..d3d0a87 --- /dev/null +++ b/docs/source/patterns.rst @@ -0,0 +1,257 @@ +Data Flow and Pipeline Patterns +############################### + +.. _patterns_overview: + +Patterns Overview +================= + +Below we summarize the core patterns that can be used to design and build out your data flows and pipelines. + +.. important:: + + The documentation for each pattern is accompanied by a data flow example. Please note that: + + * The examples are designed to convey the key differences between the various patterns. + * The examples demonstrate the changes to the target tables in Append Only, SCD1 and SCD2 scenarios. + * The customer address master table only has a few basic columns so that we can keep the example simple. + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Pattern + - Description + * - :doc:`patterns_streaming_basic_1_1` + - **Suitable for:** + + Ingestion and basic 1:1 loads. + + | + **Usage Scenario:** + + * You are ingesting data or performing one-to-one loads. 
+ * You only need to perform basic single row transforms. + + **Layers:** + + * Generally Bronze + * - :doc:`patterns_streaming_multi_source_streaming` + - **Suitable for:** + + Multi-source streaming and basic transformations. + + | + **Usage Scenario:** + + * You need to stream multiple tables in a single target table via a basic transform. + * The source tables share common business keys. + * You only need to perform basic single row transforms (e.g. enrichment). + + **Layers:** + + * Generally Silver + + **Models:** + + * 3NF such as ODS, Inmon and Enterprise Models + * Data Vault + + **Considerations & Limitations:** + + * All source tables must share the same business keys. The column names do not need to be the same in the sources, but the keys must be conceptually the same. + * In SCD 2 scenarios, a new version of a row will be generated any time data changes in any of the source streams. This will be particularly noticeable when you have late arriving records across streams and will lead to more row versions than normally expected. + * - :doc:`patterns_streaming_stream_static_basic` + - **Suitable for:** + + When you have a streaming table that you need to join to one or many additional static tables to derive your desired target data set. + + | + **Usage Scenario:** + + * You have a single streaming table driving the data flow and want to join to one or more other tables. + * You only need to reflect changes when the driving streaming table updates. + * The source tables do not share common business keys. + * You only need to perform basic single row transforms. + + **Layers:** + + * Generally Silver + * Gold (no complex transforms or aggregations) + + **Models:** + + * 3NF such as ODS, Inmon and Enterprise Models + * Data Vault + * Dimensional: dimensions and basic transactional facts + + **Considerations & Limitations:** + + * Updates in joined tables will not be reflected until a row with matching keys comes through on the driving streaming table. 
+ * - :doc:`patterns_streaming_stream_static_streaming_dwh` + - **Suitable for:** + + When you have a streaming table that you need to join to one or many additional static tables in order to derive your desired target data set, but you also want updates to the static tables to be reflected as they occur. + + | + **Usage Scenario:** + + * You want to join multiple streaming tables. + * You want changes in any/all tables to be updated as they occur. + * You only need to perform basic single row transforms. + + **Layers:** + + * Generally Silver + * Gold (no complex transforms or aggregations) + + **Models:** + + * 3NF such as ODS, Inmon and Enterprise Models + * Data Vault + * Dimensional: dimensions and basic transactional facts + + **Considerations & Limitations:** + + * More complex to implement than the Stream-Static Basic pattern but allows for true streaming joins. + * - :doc:`patterns_streaming_cdc_stream_from_snapshot` + - **Suitable for:** + + Constructing a CDC stream from a snapshot source to be used in multi-source streaming or stream-static patterns. + + | + **Usage Scenario:** + + * You need to stream multiple sources into a single target table but one or more of the sources are snapshot based. + * You want to stream only the changes from a snapshot source. + + + +Patterns Documentation +====================== + +.. toctree:: + :maxdepth: 1 + + Pattern - Basic 1:1 + Pattern - Multi-Source Streaming + Pattern - Stream-Static - Basic + Pattern - Stream-Static - Streaming Data Warehouse + Pattern - CDC Stream from Snapshots +.. _patterns_mix_and_match: + +Multi-Source Streaming and Flow Groups +====================================== + +The :doc:`Multi-Source Streaming ` feature allows you to stream multiple flows into a single target. + +Per the :doc:`concepts` section of this documentation, Flow Groups are used to logically group flows. This is useful when you have multiple complex sources and makes data flow development and maintenance more manageable. 
+ +You can design your pipelines with multiple flow groups, e.g. if you have tables from 50 source systems streaming into one target table via a series of different transformations, you would likely design your data flow to have 50 Flow Groups, one for each source. + +The diagram below shows a data flow with two flow groups, each with their own flows, and each populating the same target table: + +.. image:: images/stream_multi_monolithic.png + :target: _images/stream_multi_monolithic.png + :alt: Monolithic Pipelines + +.. important:: + + This applies to all data flows and patterns that use Flow Groups. + +.. important:: + + Per the :doc:`concepts` section of this documentation, Flow Groups and Flows can be added and removed from a data flow as your requirements and systems evolve. This will not break the existing pipeline and will not require a full refresh of the Pipeline. + +Mix and Match +============= + +You can have one or more data flows in a single pipeline, and each of these can be based on a different pattern. + +You can also mix and match patterns in a single data flow, where you have multiple :ref:`Flow Groups ` populating the same target table; as shown below: + +.. image:: images/mix_and_match.png + :target: _images/mix_and_match.png + :alt: Mix and Match Patterns + +.. _patterns_scaling_pipelines: + +Scaling and Pipeline Scope +========================== + +When designing your data flows and pipelines, you will need to decide how you will scale and scope your data flows and pipelines to support your business requirements. + +There is no hard and fast rule in determining how to divide up your pipelines, what you choose will depend on your specific requirements and constraints. The following factors will influence your choice: + +* Your organizational structure. +* Your operational practices and your CI/CD processes. +* The size and complexity of your data e.g. the number of sources, transformations, targets and volumes. 
+* Your latency requirements and your SLA's. +* and many more ... + +Ultimately you will need to determine the best way to divide up your pipelines to support your business requirements. + +.. important:: + + Per the :doc:`concepts` section of this documentation: + + * A data flow, and its Data Flow Spec, defines the source(s) and logic required to generate a **single target table**. + * A Pipeline Bundle can contain multiple Data Flow Specs, and a Pipeline deployed by the bundle may execute the logic for one or more Data Flow Specs. + + For the above reasons **the smallest possible division for a Pipeline is a single data flow and hence a single target table**. + +.. warning:: + + Be aware of the current Pipeline and concurrency limits for DLT. These are subject to change and you can check the latest limits at: + + * https://docs.databricks.com/en/resources/limits.html + * https://docs.databricks.com/en/delta-live-tables/limitations.html + +Pipeline Scope +--------------- + +The most common strategy is to logically group your target tables as a starting point and then determine your pipeline scope from there. +Some of the most common groupings are shown below: + +.. list-table:: + :widths: 30 70 + :header-rows: 1 + + * - Logical Grouping + - Description + * - Use Case + - You may choose to have an end to end pipeline for given Use Cases + * - Bronze + - * Source System - A Pipeline per Source System or application + * - Silver / Enterprise Models + - * Subject Area / Sub-Domain - A Pipeline per Subject Area, or Sub-Domain + * Use Case - A Pipeline per Use Case + * Target Table - A Pipeline per target table, the most granular level for complex data flows + * - Gold / Dimensional Models + - * Data Mart - A Pipeline per Data Mart + * Common Dimensions - A Pipeline for your Common Dimensions + * Target Table - A Pipeline for complex Facts or target tables. 
+ +Once you have determined the best way to divide up your pipelines, you can then determine the best way to implement them, which will fall into one of the following categories: + +Decomposing Pipelines +--------------------- + +You can break a pipeline down into smaller, more manageable pipelines where natural boundaries exist. + +In the below example, we start with a pipeline that has two Flow Groups flowing into a target table, via some staging tables: + +.. image:: images/stream_multi_monolithic.png + :target: _images/stream_multi_monolithic.png + :alt: Atomic Pipelines + +Below is the same pipeline decomposed into three pipelines: + +* Each Flow Group has been broken out into a separate pipeline, the target table of which is the final staging table. +* There is a final pipeline that merges the upstream staging tables into the final target table. + +.. image:: images/stream_multi_granular.png + :target: _images/stream_multi_granular.png + :alt: Decomposed Pipelines + + diff --git a/docs/source/patterns_streaming_basic_1_1.rst b/docs/source/patterns_streaming_basic_1_1.rst new file mode 100644 index 0000000..3f6d1ae --- /dev/null +++ b/docs/source/patterns_streaming_basic_1_1.rst @@ -0,0 +1,215 @@ +Pattern - Basic 1:1 +==================== + +Description +------------ +Suitable for ingestion and basic 1:1 loads. +Use when: + +- You are ingesting data or performing one-to-one loads. +- You only need to perform basic single row transforms. + +**Layers:** Generally Bronze + +**Data Flow Components:** + +.. image:: images/basic_1_1.png + :target: _images/basic_1_1.png + :alt: Basic 1:1 + +.. list-table:: + :header-rows: 1 + + * - No. + - Component + - Description + - M / O + * - 1 + - Input View + - Input view created over the streaming source table. This view can optionally read from CDF if the source table is CDF enabled. + - M + * - 2 + - Flow + - Append or Change flow to streaming target table. 
+ - M + * - 3 + - Target Table + - A streaming table, the schema of which is specified in the dataflowspec. + - M + +Feature Support +---------------- + +.. list-table:: + :header-rows: 1 + + * - Supported + - Not Supported + * - * Append Only & SCD 1/2 + * Basic transforms such as: + + * Data type conversion + * Concatenation + * Single row calculations + * Formatting + + * Cleansing & Data Quality Rules + - * Complex transforms + * Joins + * Multiple streaming sources + * Window By + +Sample +------ +- Bundle: ``dlt_framework/src/samples/bronze_sample`` + +Example Data Flow +------------------ + +Day 1 Load +~~~~~~~~~~ + +* **Source Table (Append-Only)** + + CUSTOMER + + .. list-table:: + :header-rows: 1 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + +* **Target Table** + + **Append-Only Scenario** + + .. list-table:: + :header-rows: 1 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + + **SCD1 Scenario** + + .. list-table:: + :header-rows: 1 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + + **SCD2 Scenario** + + .. list-table:: + :header-rows: 1 + + * - customer_id + - first_name + - last_name + - email + - _START_AT + - _END_AT + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + - NULL + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + - NULL + +Day 2 Load +~~~~~~~~~~ + +* **Source Table (Append-Only)** + + CUSTOMER + + .. raw:: html + +
+ + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ +* **Target Table** + + **Append-Only Scenario** + + .. raw:: html + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + **SCD1 Scenario** + + .. raw:: html + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe jdoe@example.com 2023-01-02 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + **SCD2 Scenario** + + .. raw:: html + + + + + + > + +
customer_id first_name last_name email _START_AT _END_AT
1 John Doe jdoe@example.com 2023-01-02 10:00 NULL
1 John Doe john.doe@example.com 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00 NULL
3 Alice Green alice.green@example.com 2023-01-02 10:00 NULL
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00 NULL
\ No newline at end of file diff --git a/docs/source/patterns_streaming_cdc_stream_from_snapshot.rst b/docs/source/patterns_streaming_cdc_stream_from_snapshot.rst new file mode 100644 index 0000000..b60d9d0 --- /dev/null +++ b/docs/source/patterns_streaming_cdc_stream_from_snapshot.rst @@ -0,0 +1,220 @@ +Construct CDC Stream from Snapshot Source +========================================== + +Description +------------ +Suitable for multi-source streaming and stream-static patterns where one or more sources are snapshot based. + +Use when: + +- You have more than one snapshot sources and want to combine and stream changes to a target table. +- You have a combination of snapshot and cdc sources and want to combine and stream changes to a target table. + +.. note:: + - This pattern is not intended to be used in isolation. It is intended to convert snapshot sources into a CDC stream that can be used as part of a multi-source streaming or stream-static patterns. + +**Layers:** + +- Bronze (Construct CDC stream) +- Silver (Use CDC stream) + +Data Flow Components: +--------------------- + +.. image:: images/cdc_stream_from_snapshot.png + :target: _images/cdc_stream_from_snapshot.png + :alt: CDC Stream from Snapshot + +.. list-table:: + :header-rows: 1 + :widths: 5 20 55 20 + + * - No. + - Component + - Description + - M / O + * - 1 + - Input View + - Input view created over the snapshot source. In incremental mode this is a physical SDP view created over the source that can look different at any given time representing the latest state of the source. In historical mode, this is a logical component as instead it'd be configured to automatically get the next snapshot as the source. + - M + * - 2 + - Change Flow + - An SCD1 Change Flow that streams changes from the snapshot source to a staging table. This needs to be an SCD1 to allow for physical deletes in the snapshot source to be propagated to the CDF as a delete operation, SCD2 does not support this. 
+ - M + * - 3 + - Staging Table + - A staging table with CDF enabled. + - M + * - 4 + - View + - A view over the staging table that reads a stream from the staging table's CDF. This will be the CDC stream of the snapshot source. This view can now be an input view in any of the :doc:`Multi-Source Streaming ` patterns below. + - M + +\* M / O: Mandatory or Optional. + +Feature Support +--------------- + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Supported + - Not Supported + * - - Incremental and historical snapshot modes + - Physical deletes in source propagated as CDF deletes + - Basic transforms such as: + - Data type conversion + - Column selection and renaming + - Single row calculations + - Data quality rules + - - Complex transformations + - Window functions + - Aggregations + - SCD2 staging tables + +Considerations and Limitations +------------------------------ + +.. important:: + - In historical mode, if there are multiple snapshots processed in the first run, reading stream from the CDF of the staging table will only return the latest snapshot's records as inserts. To get all the changes from all historical snapshots, set startingVersionFromDLTSetup to true when reading the CDF of the staging table, see :doc:`dataflow_spec_ref_source_details`. + +Samples +------- +Construct CDC stream from snapshot source in bronze: + +- Bundle: ``samples/bronze_sample`` +- Sample: ``samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_datetime_scd1_main.json`` + +Use CDC stream as input view in silver: + +- Bundle: ``samples/silver_sample`` +- Sample: ``samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_source_main.json`` + +Example Data Flow +----------------- + +The sample demonstrates converting a snapshot source into a CDC stream: + +1. Source snapshot table "customer" is configured with incremental mode +2. CDC from Snapshot (Change Flow) detects changes between snapshots +3. 
Changes are written to staging table in SCD1 mode with CDF enabled +4. Final view reads CDF stream from staging table +5. CDC stream can now be used as input to other streaming patterns + +Day 1 Load +~~~~~~~~~~ + +* **Source Table (Snapshot)** + + CUSTOMER + + .. raw:: html + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
+ +* **Staging Table (SCD1)** + + CUSTOMER Staging Table + + .. raw:: html + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
+ + CDF of CUSTOMER Staging Table + + .. raw:: html + + + + +
customer_id first_name last_name email updated_timestamp _change_type _commit_version _commit_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
+ +Day 2 Load +~~~~~~~~~~ + +* **Source Table (Snapshot)** + + CUSTOMER + + .. raw:: html + + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ +* **Staging Table (SCD1)** + + CUSTOMER Staging Table + + .. raw:: html + + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + + CDF of CUSTOMER Staging Table + + .. raw:: html + + + + + + + + +
customer_id first_name last_name email updated_timestamp _change_type _commit_version _commit_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
1 John Doe jdoe@example.com 2023-01-02 10:00 update 2 2023-01-02 18:00
2 Jane Smith jane.smith@example.com 2023-01-02 10:00 delete 2 2023-01-02 18:00
3 Alice Green alice.green@example.com 2023-01-02 10:00 insert 2 2023-01-02 18:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00 insert 2 2023-01-02 18:00
+ + +Day 3 Load +~~~~~~~~~~ + +* **Source Table (Snapshot)** + + CUSTOMER + + .. raw:: html + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe jdoe@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-03 10:00
+ +* **Staging Table (SCD1)** + + CUSTOMER Staging Table + + .. raw:: html + + + + +
customer_id first_name last_name email updated_timestamp
1 John Doe jdoe@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-03 10:00
+ + + CDF of CUSTOMER Staging Table + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name email updated_timestamp _change_type _commit_version _commit_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00 insert 1 2023-01-01 18:00
1 John Doe jdoe@example.com 2023-01-02 10:00 update 2 2023-01-02 18:00
2 Jane Smith jane.smith@example.com 2023-01-02 10:00 delete 2 2023-01-02 18:00
3 Alice Green alice.green@example.com 2023-01-02 10:00 insert 2 2023-01-02 18:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00 insert 2 2023-01-02 18:00
3 Alice Green alice.green@example.com 2023-01-03 10:00 delete 3 2023-01-03 18:00
+ diff --git a/docs/source/patterns_streaming_flow_groups.rst b/docs/source/patterns_streaming_flow_groups.rst new file mode 100644 index 0000000..a2fa7a7 --- /dev/null +++ b/docs/source/patterns_streaming_flow_groups.rst @@ -0,0 +1,18 @@ +Multi-Source Streaming and Flow Groups +###################################### + +The :doc:`Multi-Source Streaming ` feature allows you to stream multiple flows into a single target. + +Per the :doc:`Concepts ` section of this documentation, Flow Groups are used to logically group flows. This is useful when you have multiple complex sources and makes data flow development and maintenance more manageable. + +You can design your pipelines with multiple flow groups, e.g if you have tables from 50 source systems streaming into one target table via a series of different transformations, you would likely design your data flow to have 50 Flow Groups, one for each source. + +The diagram below shows a data flow with two flow groups, each with their own flows, and each populating the same target table: + +.. image:: images/stream_multi_monolithic.png + :target: _images/stream_multi_monolithic.png + :alt: Monolithic Pipelines + +.. important:: + + Per the :ref:`concepts` section of this documentation, Flow Groups and Flows can be added and removed from a data flow as your requirements and systems evolve. This will not break the existing pipeline and will not require a full refresh of the Pipeline. \ No newline at end of file diff --git a/docs/source/patterns_streaming_multi_source_streaming.rst b/docs/source/patterns_streaming_multi_source_streaming.rst new file mode 100644 index 0000000..272687d --- /dev/null +++ b/docs/source/patterns_streaming_multi_source_streaming.rst @@ -0,0 +1,475 @@ +Multi-Source Streaming +======================= + +Description +------------ +Suitable for multi-source streaming with basic transformations. +Use when: + +- You need to stream multiple tables in a single target table via a basic transform. 
+- The source tables share common business keys. +- You only need to perform basic single row transforms. + +**Layers:** Generally Silver + +**Models:** + +- 3NF such as ODS, Inmon and Enterprise Models +- Data Vault + +**Data Flow Components:** + +.. image:: images/stream_multi_base.png + :target: _images/stream_multi_base.png + :alt: Multi-Source Streaming - Base + +.. list-table:: + :header-rows: 1 + :widths: 5 15 60 20 + + * - No. + - Component + - Description + - M / O + * - 1 + - Input Views + - Input views are created over each streaming source table (as many as required). All source tables must share a common set of PK's and must contain a sequence by column of the same data type. These views can optionally read from CDF if the source tables are CDF enabled. + - M + * - 2 + - Append Flows + - Append flows load the rows from each streaming source table into a staging table. + - M + * - 3 + - Staging Append Only Table + - A streaming append only table, the schema of which consists of the common primary keys, sequence by and the data columns returned by each input view. + - M + * - 4 + - Change Flow + - A single change flow loads the data into the staging merge table. It essentially merges and dedupes all the rows on the common PK's. + - M + * - 5 + - Staging Merge Table + - A streaming table, the schema of which consists of the common primary keys, sequence by and the data columns returned by each input view. CDF is enabled on this table. + - M + * - 6 + - Final CDF View + - A view over the staging merge table that reads a stream from the merge tables change data feed. + - M + * - 7 + - Final Transform View + - A view that applies a SQL transform (SELECT or CTE) to the data returned by the Final CDF View. This is optional and not required if no transformation needs to be applied. If you don't have a transform requirement you can omit the transform view. 
You may for example only need to specify which columns you want or perform a basic column renaming, which you can do in the Final CDF View (component 6). + - O + * - 8 + - Append or Change Flow + - An Append Flow (for transactional or fact based target tables) or an SCD1/2 Change Flow that loads the data into the final target table. + - M + * - 9 + - Target Table + - A streaming table, the schema of which is specified in the dataflowspec. This table is the final target table for the given flow. + - M + +\* M / O: Mandatory or Optional. + +Feature Support +--------------- + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Supported + - Not Supported + * - * Append Only & SCD 1/2 + * Basic transforms such as: + + * Data type conversion + * Concatenation + * Single row calculations + * Formatting + + * Cleansing & Data Quality Rules + * Conditionals and calculations (single row) across multiple source tables + - * Complex transforms such as aggregations + * Joins + * Window By + +Considerations and Limitations +------------------------------ + +.. important:: + + - All source tables must share the same business keys. The column names do not need to be the same in the sources, but the keys must be conceptually the same. + - In SCD 2 scenarios, a new version of a row will be generated any time data changes in any of the source streams. This will be particularly noticeable when you have late arriving records across streams and will lead to more row versions than normally expected. + + +Sample +------ +- Bundle: ``dlt_framework/src/samples/silver_sample`` +- Sample: ``dlt_framework/src/samples/silver_sample/src/dataflows/customer_p4`` + + +Example Data Flow +------------------ + +.. note:: + + In the below example data flow, customer ID 4 demonstrates the behaviour for late arriving records in streaming sources. This ultimately means you will potentially have more versions for a row in SCD2 scenarios. 
This needs to be weighed against your requirements and will either be acceptable or not. If this is not acceptable, please refer to the stream-static patterns below. + +Day 1 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 25 30 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + + CUSTOMER_ADDRESS + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 30 + + * - customer_id + - city + - state + - load_timestamp + * - 1 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 4 + - Hobart + - TAS + - 2023-01-01 10:00 + +* **Staging Table (stg_source_1_appnd)** + +.. list-table:: + :header-rows: 1 + :widths: 15 15 15 25 15 15 30 + + * - customer_id + - first_name + - last_name + - email + - city + - state + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - NULL + - NULL + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - NULL + - NULL + - 2023-01-01 10:00 + * - 1 + - NULL + - NULL + - NULL + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - NULL + - NULL + - NULL + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 4 + - NULL + - NULL + - NULL + - Hobart + - TAS + - 2023-01-01 10:00 + +* **Staging Table (stg_source_1_mrg)** + +.. 
list-table:: + :header-rows: 1 + :widths: 15 15 15 25 15 15 25 25 + + * - customer_id + - first_name + - last_name + - email + - city + - state + - _START_AT + - _END_AT + * - 1 + - John + - Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 2 + - Jane + - Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 4 + - NULL + - NULL + - NULL + - Hobart + - TAS + - 2023-01-01 10:00 + - NULL + +* Target Table + + * **Append-Only Scenario** + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 30 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - load_timestamp + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 4 + - NULL + - NULL + - NULL + - NULL + - Hobart + - TAS + - 2023-01-01 10:00 + + * SCD1 Scenario + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + * - 4 + - NULL + - NULL + - NULL + - NULL + - Hobart + - TAS + + * SCD2 Scenario + + .. 
list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 25 25 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - _START_AT + - _END_AT + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 4 + - NULL + - NULL + - NULL + - NULL + - Hobart + - TAS + - 2023-01-01 10:00 + - NULL + +Day 2 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
+* **Staging Table (stg_source_1_appnd)** + + .. raw:: html + + + + + + + + + + + + +
customer_id first_name last_name email city state load_timestamp
1 John Doe john.doe@example.com NULL NULL 2023-01-01 10:00
2 Jane Smith jane.smith@example.com NULL NULL 2023-01-01 10:00
1 NULL NULL NULL Melbourne VIC 2023-01-01 10:00
2 NULL NULL NULL Melbourne VIC 2023-01-01 10:00
4 NULL NULL NULL Hobart TAS 2023-01-01 10:00
1 John Doe jdoe@example.com NULL NULL 2023-01-02 10:00
3 Alice Green alice.green@example.com NULL NULL 2023-01-02 10:00
2 NULL NULL NULL Perth WA 2023-01-02 10:00
3 NULL NULL NULL Sydney NSW 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com NULL NULL 2023-01-02 10:00
+ +* **Staging Table (stg_source_1_mrg)** + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name email city state _START_AT _END_AT
1 John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 NULL
1 John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
4 NULL NULL NULL Hobart TAS 2023-01-01 10:00 2023-01-02 10:00
+ +* **Target Table** + + * Append-Only Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
4 NULL NULL NULL NULL Hobart TAS 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
+ + * SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe jdoe@example.com Melbourne VIC
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + * SCD2 Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 NULL
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
4 NULL NULL NULL NULL Hobart TAS 2023-01-01 10:00 2023-01-02 10:00
\ No newline at end of file diff --git a/docs/source/patterns_streaming_stream_static_basic.rst b/docs/source/patterns_streaming_stream_static_basic.rst new file mode 100644 index 0000000..6f9e8d0 --- /dev/null +++ b/docs/source/patterns_streaming_stream_static_basic.rst @@ -0,0 +1,466 @@ +Stream-Static Basic +==================== + +Description +------------ +Suitable for when you have a streaming table that you need to join to one or many additional static tables to derive your desired target data set. + +Use when: + +- You have a single streaming table driving the data flow and want to join to one or more other tables. +- You only need to reflect changes when the driving streaming table updates. +- The source tables do not share common business keys. +- You only need to perform basic single row transforms. + +**Layers:** + +- Silver +- Gold (no complex transforms or aggregations) + +**Models:** + +- 3NF such as ODS, Inmon and Enterprise Models +- Data Vault +- Dimensional: dimensions and basic transactional facts + +Data Flow Components: +--------------------- + +.. image:: images/stream_static_basic.png + :target: _images/stream_static_basic.png + :alt: Stream Static - Basic + +.. list-table:: + :header-rows: 1 + :widths: 5 20 55 20 + + * - No. + - Component + - Description + - M / O + * - 1 + - Input View + - Input view created over the streaming source table that will ultimately static join to one or more additional source tables, in the next view below. This view can optionally read from CDF if the source table is CDF enabled. + - M + * - 2 + - View + - A view that defines SQL, joining the input view above to one or more additional source tables. + - M + * - 3 + - Append or Change Flow + - An Append Flow (for transactional or fact based target tables) or an SCD1/2 Change Flow that loads the data into the final target table. + - M + * - 4 + - Target Table + - A streaming table, the schema of which is specified in the dataflowspec. 
This table is the final target table for the given flow. + - M + +\* M / O: Mandatory or Optional. + +Feature Support +--------------- + +.. list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Supported + - Not Supported + * - - Append Only & SCD 1/2 + - Basic transforms such as: + - Data type conversion + - Concatenation + - Single row calculations + - Formatting + - Cleansing & Data Quality Rules + - Conditionals and calculations (single row) across multiple source tables + - Joins + - - Complex transforms such as aggregations + - Window By + +Considerations and Limitations +--------------------------- + +.. important:: + - Updates in joined tables will not be reflected until a row with matching keys comes through on the driving streaming table. + +Sample +------ +- Bundle: ``dlt_framework/src/samples/silver_sample`` +- Sample: ``dlt_framework/src/samples/silver_sample/src/dataflows/stream_static_p6`` + +Example Data Flow +--------------- + +.. important:: + - The Day 3 load below demonstrates a limitation of this pattern. Updates in any of the static tables will only be loaded once corresponding rows (with the same PK's) in the primary streaming table are updated. This is resolved by the :doc:`Streaming Data Warehouse ` pattern below. + + Note this behavior may be perfectly acceptable in a given scenario, which is why this pattern remains relevant, as it is a simpler implementation than the :doc:`Streaming Data Warehouse ` pattern + +Day 1 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 25 30 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + + CUSTOMER_ADDRESS + + .. 
list-table:: + :header-rows: 1 + :widths: 15 15 15 30 + + * - customer_id + - city + - state + - load_timestamp + * - 1 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 4 + - Hobart + - TAS + - 2023-01-01 10:00 + +* Target Table + + * **Append-Only Scenario** + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 30 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - load_timestamp + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + + * SCD1 Scenario + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + + * SCD2 Scenario + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 15 25 15 15 25 25 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - _START_AT + - _END_AT + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + +Day 2 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
+ +**Target Table** + + - Append-Only Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
+ + - SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe jdoe@example.com Melbourne VIC
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + - SCD2 Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 NULL
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
+ + .. note:: + + - Customer 1's change of location to Brisbane did not propagate as there was no corresponding new row in the primary Customer table on this Day. + - This change in location will only be reflected once a new row for customer 1 is loaded into the Customer source table. + +Day 3 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
1 Brisbane QLD 2023-01-03 10:00
+ +* **Target Table** + + - Append-Only Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
+ + - SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe jdoe@example.com Melbourne VIC
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + - SCD2 Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 NULL
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
+ + .. note:: + + - Customer 1's change of location to Brisbane did not propagate as there was no corresponding new row in the primary Customer table on this Day. + - This change in location will only be reflected once a new row for customer 1 is loaded into the Customer source table. + +Day 4 Load +~~~~~~~~~~ + +* **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
1 John Doe john.doe@another.example.com 2023-01-04 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
1 Brisbane QLD 2023-01-03 10:00
+ +* **Target Table** + + - Append-Only Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
1 John Doe John Doe john.doe@another.example.com Brisbane QLD 2023-01-04 10:00
+ + - SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe john.doe@another.example.com Brisbane QLD
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + - SCD2 Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe john.doe@another.example.com Brisbane QLD 2023-01-04 10:00 NULL
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 2023-01-04 10:00
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
+ + .. note:: + + Both the change of address on day 3 and the change of email on day 4 come through on this load. \ No newline at end of file diff --git a/docs/source/patterns_streaming_stream_static_streaming_dwh.rst b/docs/source/patterns_streaming_stream_static_streaming_dwh.rst new file mode 100644 index 0000000..4a5ec04 --- /dev/null +++ b/docs/source/patterns_streaming_stream_static_streaming_dwh.rst @@ -0,0 +1,477 @@ +Stream-Static - Streaming Data Warehouse +========================================= + +Description +------------ +Suitable for when you have a streaming table that you need to join to one or many additional static tables in order to derive your desired target data set, but you also want updates to the static tables to be reflected as they occur. + +Use when: + +- You want to join multiple streaming tables. +- You want changes in any/all tables to be updated as they occur. +- You only need to perform basic single row transforms. + +**Layers:** + +- Silver +- Gold (no complex transforms or aggregations) + +**Models:** + +- 3NF such as ODS, Inmon and Enterprise Models +- Data Vault +- Dimensional: dimensions and basic transactional facts + +**Data Flow Components:** + +.. image:: images/stream_static_dwh.png + :target: _images/stream_static_dwh.png + :alt: Stream Static - Streaming DWH + +.. list-table:: + :header-rows: 1 + :widths: 5 15 60 20 + + * - No. + - Component + - Description + - M / O + * - 1 + - Input Views + - Input views are created over each streaming source table (as many as required). These views need only return: + + - the columns required for the necessary joins + - a Sequence By column if this is an SCD 1/2 use case + + These views can optionally read from CDF if the source tables are CDF enabled. + - M + * - 2 + - Append Flows + - Append flows load only the PK's and Sequence By columns into a staging table. 
+ - M + * - 3 + - Staging Append Only Table + - A streaming append only table, the schema of which consists of only the primary keys and sequence by columns returned by each input view. + - M + * - 4 + - Change Flow + - A single change flow loads the data into the staging merge table. It essentially merges and dedupes all the rows. + - M + * - 5 + - Staging Merge Table + - A streaming table, the schema of which consists of only the primary keys and sequence by columns. CDF is enabled on this table. + - M + * - 6 + - Stream-static Join View + - A view that implements the frameworks delta-join source type. It uses the previous staging table as the driving streaming table, reading from its CDF feed, and performs static joins to ALL the tables defined in the join. + - M + * - 7 + - Final Transform View + - A view that applies a SQL transform (SELECT or CTE) to the data returned by the Stream-static Join View. This is optional and not required if no transformation needs to be applied. If you don't have a transform requirement you can omit the transform view. You may for example only need to specify which columns you want or perform a basic column renaming, which you can do in the Stream-static Join View (component 6). + - O + * - 8 + - Append or Change Flow + - An Append Flow (for transactional or fact based target tables) or an SCD1/2 Change Flow that loads the data into the final target table. + - M + * - 9 + - Target Table + - A streaming table, the schema of which is specified in the dataflowspec. This table is the final target table for the given flow. + - M + +Feature Support +---------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 50 50 + + * - Supported + - Not Supported + * - - Append Only & SCD 1/2 + - Basic transforms such as: + + - Data type conversion + - Concatenation + - Single row calculations + - Formatting + + - Cleansing & Data Quality Rules + - Conditionals and calculations (single row) across multiple source tables + - Joins + - - Complex transforms such as aggregations + - Window By + +Considerations and Limitations +------------------------------- +- More complex to implement than the Stream-Static Basic pattern but allows for true streaming joins. + +Sample +------ +- Bundle: ``dlt_framework/src/samples/silver_sample`` +- Sample: ``dlt_framework/src/samples/silver_sample/src/dataflows/stream_static_p7`` + +Example Data Flow +------------------ + +Day 1 Load +~~~~~~~~~~ +- **Source Tables (Append-Only)** + + CUSTOMER + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 25 30 + + * - customer_id + - first_name + - last_name + - email + - load_timestamp + * - 1 + - John + - Doe + - john.doe@example.com + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - jane.smith@example.com + - 2023-01-01 10:00 + + CUSTOMER_ADDRESS + + .. list-table:: + :header-rows: 1 + :widths: 15 15 15 30 + + * - customer_id + - city + - state + - load_timestamp + * - 1 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 4 + - Hobart + - TAS + - 2023-01-01 10:00 + +- **Staging Table (stg_source_1_appnd)** + + .. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - customer_id + - load_timestamp + * - 1 + - 2023-01-01 10:00 + * - 2 + - 2023-01-01 10:00 + * - 1 + - 2023-01-01 10:00 + * - 2 + - 2023-01-01 10:00 + * - 4 + - 2023-01-01 10:00 + +- **Target Table** + + - Append-Only Scenario + + .. 
list-table:: + :header-rows: 1 + :widths: 12 12 12 12 20 12 10 10 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - load_timestamp + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + + - SCD1 Scenario + + .. list-table:: + :header-rows: 1 + :widths: 12 12 12 12 20 12 10 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + + - SCD2 Scenario + + .. list-table:: + :header-rows: 1 + :widths: 12 12 12 12 20 12 10 15 15 + + * - customer_id + - first_name + - last_name + - full_name + - email + - city + - state + - _START_AT + - _END_AT + * - 1 + - John + - Doe + - John Doe + - john.doe@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + * - 2 + - Jane + - Smith + - Jane Smith + - jane.smith@example.com + - Melbourne + - VIC + - 2023-01-01 10:00 + - NULL + +Day 2 Load +~~~~~~~~~~~ +- **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
+ +- **Staging Table (stg_source_1_appnd)** + + .. raw:: html + + + + + + + + + + + + +
customer_id load_timestamp
1 2023-01-01 10:00
2 2023-01-01 10:00
1 2023-01-01 10:00
2 2023-01-01 10:00
4 2023-01-01 10:00
1 2023-01-02 10:00
3 2023-01-02 10:00
2 2023-01-02 10:00
3 2023-01-02 10:00
4 2023-01-02 10:00
+ +- **Staging Table (stg_source_1_mrg)** + + .. raw:: html + + + + + + + + + +
customer_id _START_AT _END_AT
1 2023-01-02 10:00 NULL
1 2023-01-01 10:00 2023-01-02 10:00
2 2023-01-02 10:00 NULL
2 2023-01-01 10:00 2023-01-02 10:00
3 2023-01-02 10:00 NULL
4 2023-01-02 10:00 NULL
4 2023-01-01 10:00 2023-01-02 10:00
+ +- **Target Table** + + - Append-Only Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
+ + - SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe jdoe@example.com Melbourne VIC
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + - SCD2 Scenario + + .. raw:: html + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 NULL
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
+ +Day 3 Load +~~~~~~~~~~~ +- **Source Tables (Append-Only)** + + CUSTOMER + + .. raw:: html + + + + + + + +
customer_id first_name last_name email load_timestamp
1 John Doe john.doe@example.com 2023-01-01 10:00
2 Jane Smith jane.smith@example.com 2023-01-01 10:00
1 John Doe jdoe@example.com 2023-01-02 10:00
3 Alice Green alice.green@example.com 2023-01-02 10:00
4 Joe Bloggs joe.bloggs@example.com 2023-01-02 10:00
+ + CUSTOMER_ADDRESS + + .. raw:: html + + + + + + + + +
customer_id city state load_timestamp
1 Melbourne VIC 2023-01-01 10:00
2 Melbourne VIC 2023-01-01 10:00
4 Hobart TAS 2023-01-01 10:00
2 Perth WA 2023-01-02 10:00
3 Sydney NSW 2023-01-02 10:00
1 Brisbane QLD 2023-01-03 10:00
+ +- **Staging Table (stg_source_1_appnd)** + + .. raw:: html + + + + + + + + + + + + +
customer_id load_timestamp
1 2023-01-01 10:00
2 2023-01-01 10:00
1 2023-01-01 10:00
2 2023-01-01 10:00
4 2023-01-01 10:00
1 2023-01-02 10:00
3 2023-01-02 10:00
2 2023-01-02 10:00
3 2023-01-02 10:00
4 2023-01-02 10:00
1 2023-01-03 10:00
+ +- **Staging Table (stg_source_1_mrg)** + + .. raw:: html + + + + + + + + + + +
customer_id _START_AT _END_AT
1 2023-01-03 10:00 NULL
1 2023-01-02 10:00 2023-01-03 10:00
1 2023-01-01 10:00 2023-01-02 10:00
2 2023-01-02 10:00 NULL
2 2023-01-01 10:00 2023-01-02 10:00
3 2023-01-02 10:00 NULL
4 2023-01-02 10:00 NULL
4 2023-01-01 10:00 2023-01-02 10:00
+ +- **Target Table** + + - Append-Only Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state load_timestamp
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00
1 John Doe John Doe jdoe@example.com Brisbane QLD 2023-01-03 10:00
+ + - SCD1 Scenario + + .. raw:: html + + + + + + +
customer_id first_name last_name full_name email city state
1 John Doe John Doe jdoe@example.com Brisbane QLD
2 Jane Smith Jane Smith jane.smith@example.com Perth WA
3 Alice Green Alice Green alice.green@example.com Sydney NSW
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS
+ + - SCD2 Scenario + + .. raw:: html + + + + + + + + + +
customer_id first_name last_name full_name email city state _START_AT _END_AT
1 John Doe John Doe jdoe@example.com Brisbane QLD 2023-01-03 10:00 NULL
1 John Doe John Doe jdoe@example.com Melbourne VIC 2023-01-02 10:00 2023-01-03 10:00
1 John Doe John Doe john.doe@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
2 Jane Smith Jane Smith jane.smith@example.com Perth WA 2023-01-02 10:00 NULL
2 Jane Smith Jane Smith jane.smith@example.com Melbourne VIC 2023-01-01 10:00 2023-01-02 10:00
3 Alice Green Alice Green alice.green@example.com Sydney NSW 2023-01-02 10:00 NULL
4 Joe Bloggs Joe Bloggs joe.bloggs@example.com Hobart TAS 2023-01-02 10:00 NULL
\ No newline at end of file diff --git a/docs/source/pipeline_execution.rst b/docs/source/pipeline_execution.rst new file mode 100644 index 0000000..371b4a5 --- /dev/null +++ b/docs/source/pipeline_execution.rst @@ -0,0 +1,10 @@ +Pipeline Execution +########################## + +*Under Construction* + +Useful Pipeline Settings +======================== + +Ignoring Validation Errors: :doc:`feature_validation` +Logging: :doc:`feature_logging` diff --git a/docs/source/splitting_dataflow_spec.rst b/docs/source/splitting_dataflow_spec.rst new file mode 100644 index 0000000..7e16840 --- /dev/null +++ b/docs/source/splitting_dataflow_spec.rst @@ -0,0 +1,384 @@ +Splitting Flows Data Flow Spec into main and flow files +------------------------------------------------------- +A data flow spec can be broken up into a main (ending with ``_main.json|yaml``) and flow (ending with ``_flow.json|yaml``) spec file. + +The main spec file will contain the main pipeline configuration and the flow spec file will contain the flow configuration; the two files are linked by having the same ``dataFlowId``. + +To achieve this, the main spec file will have the structure described in the :doc:`dataflow_spec_ref_main_flows` schema without the :ref:`flow-group-configuration` property, which is instead moved to the flow spec file. +The flow spec file will have the structure described in the :ref:`flow-group-configuration` schema, but the ``dataFlowId`` is now required as it serves as the link to the main spec. + +Below is a sample of how a Data Flow Spec can be split into main and flow spec files: + +Original Data Flow Spec file (single unsplit file): + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "dataFlowId": "etp5stg", + "dataFlowGroup": "etp5", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "staging_table_mrg_p5", + "schemaPath": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "partitionColumns": [] + }, + "cdcSettings": { + "keys": [ + "CONTRACT_ID" + ], + "sequence_by": "EXTRACT_DTTM", + "where": "", + "ignore_null_updates": true, + "except_column_list": [ + "__START_AT", + "__END_AT" + ], + "scd_type": "2", + "track_history_column_list": [], + "track_history_except_column_list": [] + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "off", + "quarantineTargetDetails": {}, + "flowGroups": [ + + { + "flowGroupId": "et1", + "stagingTables": { + "staging_table_apnd_p5": { + "type": "ST", + "schemaPath": "" + } + }, + "flows": { + "f_contract": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_contract" + }, + "views": { + "v_brz_contract": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "contract", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_loan": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_loan" + }, + "views": { + "v_brz_loan": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "loan", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "staging_table_mrg_p5", + "sourceView": "staging_table_apnd_p5" + } + } + } + } + ] + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: etp5stg + dataFlowGroup: etp5 + dataFlowType: flow + targetFormat: delta + targetDetails: + table: staging_table_mrg_p5 + schemaPath: '' + tableProperties: + delta.enableChangeDataFeed: 'true' + partitionColumns: [] + cdcSettings: + keys: + - CONTRACT_ID + sequence_by: EXTRACT_DTTM + where: '' + ignore_null_updates: true + except_column_list: + - __START_AT + - __END_AT + scd_type: '2' + track_history_column_list: [] + track_history_except_column_list: [] + dataQualityExpectationsEnabled: false + quarantineMode: 'off' + quarantineTargetDetails: {} + flowGroups: + - flowGroupId: et1 + stagingTables: + staging_table_apnd_p5: + type: ST + schemaPath: '' + flows: + f_contract: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_contract + views: + v_brz_contract: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: contract + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_loan: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_loan + views: + v_brz_loan: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: loan + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_merge: + flowType: merge + flowDetails: + targetTable: staging_table_mrg_p5 + sourceView: staging_table_apnd_p5 + +Split Data Flow Spec into main and flow files: + +Main file: + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "dataFlowId": "etp5stg", + "dataFlowGroup": "etp5", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "staging_table_mrg_p5", + "schemaPath": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "partitionColumns": [] + }, + "cdcSettings": { + "keys": [ + "CONTRACT_ID" + ], + "sequence_by": "EXTRACT_DTTM", + "where": "", + "ignore_null_updates": true, + "except_column_list": [ + "__START_AT", + "__END_AT" + ], + "scd_type": "2", + "track_history_column_list": [], + "track_history_except_column_list": [] + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "off", + "quarantineTargetDetails": {} + } + + .. tab:: YAML + + .. code-block:: yaml + + dataFlowId: etp5stg + dataFlowGroup: etp5 + dataFlowType: flow + targetFormat: delta + targetDetails: + table: staging_table_mrg_p5 + schemaPath: '' + tableProperties: + delta.enableChangeDataFeed: 'true' + partitionColumns: [] + cdcSettings: + keys: + - CONTRACT_ID + sequence_by: EXTRACT_DTTM + where: '' + ignore_null_updates: true + except_column_list: + - __START_AT + - __END_AT + scd_type: '2' + track_history_column_list: [] + track_history_except_column_list: [] + dataQualityExpectationsEnabled: false + quarantineMode: 'off' + quarantineTargetDetails: {} + +Flow file: + +.. tabs:: + + .. tab:: JSON + + .. 
code-block:: json + + { + "dataFlowId": "etp5stg", + "flowGroupId": "et1", + "stagingTables": { + "staging_table_apnd_p5": { + "type": "ST", + "schemaPath": "" + } + }, + "flows": { + "f_contract": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_contract" + }, + "views": { + "v_brz_contract": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "contract", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_loan": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "staging_table_apnd_p5", + "sourceView": "v_brz_loan" + }, + "views": { + "v_brz_loan": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "main.bronze_test_4", + "table": "loan", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "whereClause": [] + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "staging_table_mrg_p5", + "sourceView": "staging_table_apnd_p5" + } + } + } + } + + .. tab:: YAML + + .. 
code-block:: yaml + + dataFlowId: etp5stg + flowGroupId: et1 + stagingTables: + staging_table_apnd_p5: + type: ST + schemaPath: '' + flows: + f_contract: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_contract + views: + v_brz_contract: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: contract + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_loan: + flowType: append_view + flowDetails: + targetTable: staging_table_apnd_p5 + sourceView: v_brz_loan + views: + v_brz_loan: + mode: stream + sourceType: delta + sourceDetails: + database: main.bronze_test_4 + table: loan + cdfEnabled: true + selectExp: + - '*' + whereClause: [] + f_merge: + flowType: merge + flowDetails: + targetTable: staging_table_mrg_p5 + sourceView: staging_table_apnd_p5 diff --git a/fixtures/.gitkeep b/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. 
+ +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/pipeline_bundle_template/.gitignore b/pipeline_bundle_template/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/pipeline_bundle_template/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/pipeline_bundle_template/README.md b/pipeline_bundle_template/README.md new file mode 100644 index 0000000..cd7403b --- /dev/null +++ b/pipeline_bundle_template/README.md @@ -0,0 +1,54 @@ +# bronze_sample + +The 'bronze_sample' project was generated by using the default-python template. + +## Prerequisites: +1. Execute the setup_data Notebook once bundle is deployed, to setup the Staging source tables and data. + +## Getting started + +1. Update the databricks.yml file with appropriate details (line 4 and line 23 and 25). + +1. Update the pipelines yml's in the resources folder accordingly: + - Change schemas. + +1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +1. To deploy a development copy of this project, type: + ``` + $ databricks bundle deploy --target dev + ``` + (Note that "dev" is the default target, so the `--target` parameter + is optional here.) + + This deploys everything that's defined for this project. 
+ For example, the default template would deploy a job called + `[dev yourname] silver_ar_job` to your workspace. + You can find that job by opening your workspace and clicking on **Workflows**. + +1. Similarly, to deploy a production copy, type: + ``` + $ databricks bundle deploy --target prod + ``` + + Note that the default job from the template has a schedule that runs every day + (defined in resources/silver_ar_job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + +1. To run a job or pipeline, use the "run" command: + ``` + $ databricks bundle run + ``` + +1. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from + https://docs.databricks.com/dev-tools/vscode-ext.html. + +1. For documentation on the Databricks asset bundles format used + for this project, and for CI/CD configuration, see + https://docs.databricks.com/dev-tools/bundles/index.html. diff --git a/pipeline_bundle_template/databricks.yml b/pipeline_bundle_template/databricks.yml new file mode 100644 index 0000000..f728fa0 --- /dev/null +++ b/pipeline_bundle_template/databricks.yml @@ -0,0 +1,30 @@ +# This is a Databricks asset bundle definition for bronze_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: bronze_sample + +include: + - resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + default: /Workspace/Users//.bundle/dlt_framework//current/files/src + schema: + description: The target UC schema + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. 
https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: bronze + logical_env: + description: The logical environment + default: "" + +targets: + # The 'dev' target, for development purposes. This target is the default. + dev: + mode: development + default: true diff --git a/pipeline_bundle_template/fixtures/.gitkeep b/pipeline_bundle_template/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/pipeline_bundle_template/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. + +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/pipeline_bundle_template/pytest.ini b/pipeline_bundle_template/pytest.ini new file mode 100644 index 0000000..80432c2 --- /dev/null +++ b/pipeline_bundle_template/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/pipeline_bundle_template/resources/_pipeline.yml b/pipeline_bundle_template/resources/_pipeline.yml new file mode 100644 index 0000000..20f52fa --- /dev/null +++ b/pipeline_bundle_template/resources/_pipeline.yml @@ -0,0 +1,20 @@ +resources: + pipelines: + dlt_framework__pipeline: + name: dlt_framework__pipeline${var.logical_env} + channel: CURRENT + serverless: true + catalog: ${var.catalog} + target: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + 
bundle.sourcePath: /Workspace/${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: diff --git a/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json b/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json new file mode 100644 index 0000000..555d733 --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json @@ -0,0 +1,43 @@ +{ + "dataFlowId": "", + "dataFlowGroup": "", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "quarantineMode": "", + "quarantineTargetDetails": { + "targetFormat": "delta" + }, + "flowGroups": [ + { + "flowGroupId": "", + "flows": { + "f_target": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "", + "sourceView": "" + }, + "views": { + "v_": { + "mode": "stream", + "sourceType": "", + "sourceDetails": { + "database": "{}", + "table": "", + "cdfEnabled": true + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json b/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json new file mode 100644 index 0000000..3871f44 --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json @@ -0,0 +1,29 @@ +{ + "dataFlowId": "", + "dataFlowGroup": "", + "dataFlowType": "standard", + "sourceSystem": "", + "sourceType": "", + "sourceViewName": "", + "sourceDetails": { + "database": "{}", + "table": "", + "cdfEnabled": true, + "selectExp": [ + "", + "", + "" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "", + "tableProperties": { 
+ "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "_schema.json" + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "" +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//expectations/_dqe.json b/pipeline_bundle_template/src/dataflows//expectations/_dqe.json new file mode 100644 index 0000000..808265a --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//expectations/_dqe.json @@ -0,0 +1,24 @@ +{ + "expect_or_drop": [ + { + "name": "", + "constraint": "", + "tag": "", + "enabled": true + }, + { + "name": "", + "constraint": "", + "tag": "", + "enabled": false + } + ], + "expect_or_fail": [ + { + "name": "", + "constraint": "", + "tag": "", + "enabled": true + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//schemas/_schema.json b/pipeline_bundle_template/src/dataflows//schemas/_schema.json new file mode 100644 index 0000000..340285d --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//schemas/_schema.json @@ -0,0 +1,17 @@ +{ + "type": "struct", + "fields": [ + { + "name": "", + "type": "", + "nullable": true, + "metadata": {} + }, + { + "name": "", + "type": "", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json b/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json new file mode 100644 index 0000000..555d733 --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//dataflowspec/[flow]_main.json @@ -0,0 +1,43 @@ +{ + "dataFlowId": "", + "dataFlowGroup": "", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "quarantineMode": "", + "quarantineTargetDetails": { + "targetFormat": "delta" + }, + "flowGroups": [ + { + "flowGroupId": "", + "flows": { + 
"f_target": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "", + "sourceView": "" + }, + "views": { + "v_": { + "mode": "stream", + "sourceType": "", + "sourceDetails": { + "database": "{}", + "table": "", + "cdfEnabled": true + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json b/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json new file mode 100644 index 0000000..3871f44 --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//dataflowspec/[standard]_main.json @@ -0,0 +1,29 @@ +{ + "dataFlowId": "", + "dataFlowGroup": "", + "dataFlowType": "standard", + "sourceSystem": "", + "sourceType": "", + "sourceViewName": "", + "sourceDetails": { + "database": "{}", + "table": "", + "cdfEnabled": true, + "selectExp": [ + "", + "", + "" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "_schema.json" + }, + "dataQualityExpectationsEnabled": false, + "quarantineMode": "" +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//expectations/_dqe.json b/pipeline_bundle_template/src/dataflows//expectations/_dqe.json new file mode 100644 index 0000000..808265a --- /dev/null +++ b/pipeline_bundle_template/src/dataflows//expectations/_dqe.json @@ -0,0 +1,24 @@ +{ + "expect_or_drop": [ + { + "name": "", + "constraint": "", + "tag": "", + "enabled": true + }, + { + "name": "", + "constraint": "", + "tag": "", + "enabled": false + } + ], + "expect_or_fail": [ + { + "name": "", + "constraint": "", + "tag": "", + "enabled": true + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/dataflows//schemas/_schema.json b/pipeline_bundle_template/src/dataflows//schemas/_schema.json new file mode 100644 index 0000000..340285d --- /dev/null +++ 
b/pipeline_bundle_template/src/dataflows//schemas/_schema.json @@ -0,0 +1,17 @@ +{ + "type": "struct", + "fields": [ + { + "name": "", + "type": "", + "nullable": true, + "metadata": {} + }, + { + "name": "", + "type": "", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/pipeline_bundle_template/src/pipeline_configs/_substitutions.json b/pipeline_bundle_template/src/pipeline_configs/_substitutions.json new file mode 100644 index 0000000..ec2d947 --- /dev/null +++ b/pipeline_bundle_template/src/pipeline_configs/_substitutions.json @@ -0,0 +1,12 @@ +{ + "tokens": { + "": "", + "": "" + }, + "prefix_suffix": { + "": { + "prefix": "", + "suffix": "" + } + } +} \ No newline at end of file diff --git a/pipeline_bundle_template/tests/main_test.py b/pipeline_bundle_template/tests/main_test.py new file mode 100644 index 0000000..333ffa3 --- /dev/null +++ b/pipeline_bundle_template/tests/main_test.py @@ -0,0 +1,6 @@ +from bronze_sample.main import get_taxis, get_spark + + +def test_main(): + taxis = get_taxis(get_spark()) + assert taxis.count() > 5 diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..80432c2 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..80d389e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,16 @@ +## requirements-dev.txt: dependencies for local development. 
+databricks-connect +databricks-dlt +delta +pyspark +jsonschema + +## testing packages +pytest + +## Dependencies for building wheel files +setuptools +wheel + +# The following packages are only required for building documentation and are not required at runtime +-r requirements-docs.txt diff --git a/requirements-docs.txt b/requirements-docs.txt new file mode 100644 index 0000000..dc78245 --- /dev/null +++ b/requirements-docs.txt @@ -0,0 +1,10 @@ +myst-parser +nbsphinx +sphinx +sphinx-toolbox +sphinx-autoapi +sphinx-markdown-builder +sphinx-copybutton +sphinx_rtd_theme +sphinx-design +sphinxcontrib-spelling diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..67421ac --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +## requirements.txt: dependencies for runtime. +## Core dependencies +jsonschema + +## Add any additional dependencies needed for custom functionality below here \ No newline at end of file diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 0000000..7fe451f --- /dev/null +++ b/samples/README.md @@ -0,0 +1,154 @@ +# The Samples + +The Framework comes with extensive samples that demonstrate the use of the framework and Lakeflow concepts. At the time of writing, samples are organized into the following bundles: + +* Bronze +* Silver +* Gold +* Test Data and Orchestrator +* TPC-H + +The samples broadly break down into the following: + +| Sample Type | Folder | Description | +|-------------|--------|-------------| +| **Base and Pattern Samples** | - `/src/dataflows/base_samples`
- `/src/dataflows/` | Bronze, Silver and Gold samples that demonstrate the patterns and data examples used in the patterns section of the documentation | +| **Feature Samples** | `/src/dataflows/feature_samples` | Sample per key feature | +| **Kafka Samples** | `/src/dataflows/kafka_samples` | Base Kafka, Confluent schema registry and SQL off Kafka samples | +| **TPC-H Sample** | Separate bundle for TPC-H samples | Based on TPC-H schema in UC samples catalog, reverse engineered to demonstrate end to end streaming data warehouse | + +## Deploying the Samples + +The samples can be deployed using the scripts located in the `samples` directory: + +* `deploy.sh`: Deploys all the samples except for TPC-H. +* `deploy_bronze.sh`: Deploys only the bronze samples. +* `deploy_silver.sh`: Deploys only the silver samples. +* `deploy_gold.sh`: Deploys only the gold samples. +* `deploy_orchestrator.sh`: Deploys only the test data and orchestrator bundle. +* `deploy_tpch.sh`: Deploys only the TPC-H sample. + +### Prerequisites: + +* Databricks CLI installed and configured +* Lakeflow framework already deployed to your workspace (see deploy_framework) + +### Interactive Deployment + +1. Navigate to the samples directory in the root of the Framework repository: + + ```console + cd samples + ``` + +2. Run the desired deploy script: + + ```console + ./deploy.sh + ``` + +3. Follow the prompts to deploy the samples. + + * **Databricks username**: Your Databricks username in the workspace you are deploying to e.g. `jane.doe@company.com`. + * **Databricks workspace**: The full URL of the workspace you are deploying to e.g. `https://company.cloud.databricks.com`. + * **Databricks CLI profile**: The Databricks CLI profile you want to use for the deployment. Default: `DEFAULT`. + * **Select Compute**: Select between Classic/Enhanced or Serverless compute (0=Enhanced, 1=Serverless). Default: `1`. + * **UC Catalog**: The Unity Catalog you want to use for the deployment. Default: `main`. 
+ * **Schema Namespace**: The first part of the name for the bronze, silver and gold schemas. Default: `lakeflow_samples`. + * **Logical environment**: The logical environment you want to use for the deployment e.g. `_test`. + + > **Important:** + > + > Always specify a logical environment when deploying the samples; this ensures you don't overwrite anyone else's existing samples in the workspace, as long as the logical environment is unique. + > + > Suggested naming: + > + > * Your initials, e.g. Jane Doe would be `_jd` + > * A Story ID, e.g. `123456` would be `_123456` + > * Your client name, e.g. Company would be `_client` + > * Others: business unit, team name, project name, etc... + +4. Once deployment is complete, you can find the deployed bundles under `/Users//.bundle/` + +### Single Command line deployment: + +1. Navigate to the samples directory in the root of the Framework repository: + + ```console + cd samples + ``` + +2. Run the desired deploy script with required parameters: + + ```console + ./deploy.sh -u -h [-p ] [-l ] [--catalog ] + ``` + + Parameters: + + * `-u, --user`: Your Databricks username (required) + * `-h, --host`: Databricks workspace host URL (required) + * `-p, --profile`: Databricks CLI profile (optional). Default: `DEFAULT`. + * `-c, --compute`: The type of compute to use (0=Enhanced, 1=Serverless). Default: `1`. + * `-l, --logical_env`: Logical environment suffix for schema names (optional). Default: `_test`. + * `--catalog`: Unity Catalog name (optional). Default: `main`. + * `--schema_namespace`: Override the first part of the name for the bronze, silver and gold schemas (optional). Default: `lakeflow_samples`. + + For example: + + ```console + ./deploy.sh -u jane.doe@company.com -h https://company.cloud.databricks.com -l _jd + ``` + +4. 
Once deployment is complete, you can find the deployed bundles under `/Users//.bundle/` + +## Using the Samples + +### Test Data and Orchestrator + +The Test Data and Orchestrator bundle includes: + +* Test data initialization and load simulation +* Multiple jobs to simulate end to end runs of the samples + +#### Jobs + +After deployment you should find the following jobs in your workspace: + +* Lakeflow Framework Samples - Run 1 - Load and Schema Initialization +* Lakeflow Framework Samples - Run 2 - Load +* Lakeflow Framework Samples - Run 3 - Load +* Lakeflow Framework Samples - Run 4 - Load + +These will be prefixed with the target and your username and suffixed with the logical environment you provided when deploying the samples. + +For example: +`[dev jane_doe] Lakeflow Framework Samples - Run 1 - Load and Schema Initialization (_jd)` + +To execute the samples, simply execute the jobs in order to simulate the end to end run of the samples over the test data. + +#### Pipelines + +You can also execute individual pipelines; these follow a similar naming convention with `Lakeflow Samples` in the name. + +## Destroying the Samples + +To destroy the samples, you can use the `destroy.sh` script following the command specified below. + +```console +./destroy.sh -h [-p ] [-l ] +``` + +Parameters: + +* `-h, --host`: Databricks workspace host URL (required) +* `-p, --profile`: Databricks CLI profile (optional, defaults to DEFAULT) +* `-l, --logical_env`: Logical environment suffix for schema names (optional) + +## TPC-H Sample + +The TPC-H sample is based off the TPC-H schema in the UC catalog and reverse engineered to demonstrate end to end streaming data warehouse. + +To deploy the TPC-H sample, you can use the `deploy_tpch.sh` script following the same methods specified above. + +This sample is currently still being built with an initial cut targeted for Sept 2025. 
\ No newline at end of file diff --git a/samples/bronze_sample/.gitignore b/samples/bronze_sample/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/samples/bronze_sample/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/samples/bronze_sample/.vscode/settings.json b/samples/bronze_sample/.vscode/settings.json new file mode 100644 index 0000000..1a79a81 --- /dev/null +++ b/samples/bronze_sample/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" +} \ No newline at end of file diff --git a/samples/bronze_sample/databricks.yml b/samples/bronze_sample/databricks.yml new file mode 100644 index 0000000..38f3dee --- /dev/null +++ b/samples/bronze_sample/databricks.yml @@ -0,0 +1,37 @@ +# This is a Databricks asset bundle definition for bronze_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: bronze_sample + +include: + - scratch/resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + schema: + description: The target UC schema + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: bronze + logical_env: + description: The logical environment + default: "" + pipeline_cluster_config: + description: Basic cluster config, add node types as necessary + default: + label: default + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + +targets: + # The 'dev' target, for development purposes. This target is the default. 
+ dev: + mode: development + default: true diff --git a/samples/bronze_sample/fixtures/.gitkeep b/samples/bronze_sample/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/bronze_sample/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. + +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/bronze_sample/pytest.ini b/samples/bronze_sample/pytest.ini new file mode 100644 index 0000000..80432c2 --- /dev/null +++ b/samples/bronze_sample/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/samples/bronze_sample/resources/classic/bronze_base_sample_pipeline.yml b/samples/bronze_sample/resources/classic/bronze_base_sample_pipeline.yml new file mode 100644 index 0000000..bd0c7a9 --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_base_sample_pipeline.yml @@ -0,0 +1,23 @@ +resources: + pipelines: + lakeflow_samples_bronze_base_pipeline: + name: Lakeflow Framework - Bronze - Base Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: 
${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + #pipeline.dataFlowIdFilter: base_customer,base_customer_file_source,base_customer_address + pipeline.dataFlowGroupFilter: base_samples + root_path: ${workspace.file_path}/src/dataflows/base_samples diff --git a/samples/bronze_sample/resources/classic/bronze_feature_samples_data_quality_pipeline.yml b/samples/bronze_sample/resources/classic/bronze_feature_samples_data_quality_pipeline.yml new file mode 100644 index 0000000..8ae0f57 --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_feature_samples_data_quality_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_data_quality_pipeline: + name: Lakeflow Framework - Feature Samples - Data Quality Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_data_quality + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_general.yml b/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_general.yml new file mode 100644 index 0000000..2b0e494 --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_general.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_pipeline_general: + name: Lakeflow Framework - Feature Samples - General Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: 
${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_general + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_python.yml b/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_python.yml new file mode 100644 index 0000000..1f5f8a9 --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_feature_samples_pipeline_python.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_pipeline_python: + name: Lakeflow Framework - Feature Samples - Python Pipeline (${var.logical_env}) + channel: CURRENT + catalog: ${var.catalog} + schema: ${var.schema} + clusters: + - ${var.pipeline_cluster_config} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_python + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/classic/bronze_feature_samples_snapshots_pipeline.yml b/samples/bronze_sample/resources/classic/bronze_feature_samples_snapshots_pipeline.yml new file mode 100644 index 0000000..246212f --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_feature_samples_snapshots_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + 
lakeflow_samples_feature_samples_snapshots_pipeline: + name: Lakeflow Framework - Feature Samples - Snapshots Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_snapshots + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/classic/bronze_feature_samples_table_migration_pipeline.yml b/samples/bronze_sample/resources/classic/bronze_feature_samples_table_migration_pipeline.yml new file mode 100644 index 0000000..96aadd0 --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_feature_samples_table_migration_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_table_migration_pipeline: + name: Lakeflow Framework - Feature Samples - Table Migration Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_table_migration + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/classic/bronze_kafka_samples_pipeline.yml 
b/samples/bronze_sample/resources/classic/bronze_kafka_samples_pipeline.yml new file mode 100644 index 0000000..166933f --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_kafka_samples_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_bronze_kafka_samples_pipeline: + name: Lakeflow Framework - Bronze - Kafka Samples Pipeline (${var.logical_env}) + catalog: ${var.catalog} + schema: ${var.schema} + channel: PREVIEW + clusters: + - ${var.pipeline_cluster_config} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + bundle.target: ${bundle.target} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: kafka_samples + root_path: ${workspace.file_path}/src/dataflows/kafka_samples diff --git a/samples/bronze_sample/resources/classic/bronze_template_samples_pipeline.yml b/samples/bronze_sample/resources/classic/bronze_template_samples_pipeline.yml new file mode 100644 index 0000000..f1df12b --- /dev/null +++ b/samples/bronze_sample/resources/classic/bronze_template_samples_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_bronze_template_samples_pipeline: + name: Lakeflow Framework - Bronze - Template Samples Pipeline (${var.logical_env}) + channel: CURRENT + catalog: ${var.catalog} + schema: ${var.schema} + clusters: + - ${var.pipeline_cluster_config} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: template_samples + root_path: ${workspace.file_path}/src/dataflows/template_samples \ No newline at end of 
file diff --git a/samples/bronze_sample/resources/serverless/bronze_base_sample_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_base_sample_pipeline.yml new file mode 100644 index 0000000..d6cdc06 --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_base_sample_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_bronze_base_pipeline: + name: Lakeflow Framework - Bronze - Base Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + #pipeline.dataFlowIdFilter: base_customer,base_customer_file_source,base_customer_address + pipeline.dataFlowGroupFilter: base_samples + root_path: ${workspace.file_path}/src/dataflows/base_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_feature_samples_data_quality_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_feature_samples_data_quality_pipeline.yml new file mode 100644 index 0000000..03d6c0f --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_feature_samples_data_quality_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_data_quality_pipeline: + name: Lakeflow Framework - Feature Samples - Data Quality Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + 
pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_data_quality + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_general.yml b/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_general.yml new file mode 100644 index 0000000..7c96445 --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_general.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_pipeline_general: + name: Lakeflow Framework - Feature Samples - General Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_general + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_python.yml b/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_python.yml new file mode 100644 index 0000000..2dafd9f --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_feature_samples_pipeline_python.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_pipeline_python: + name: Lakeflow Framework - Feature Samples - Python Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: 
${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_python + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_feature_samples_snapshots_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_feature_samples_snapshots_pipeline.yml new file mode 100644 index 0000000..be8aea9 --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_feature_samples_snapshots_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_snapshots_pipeline: + name: Lakeflow Framework - Feature Samples - Snapshots Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_snapshots + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_feature_samples_table_migration_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_feature_samples_table_migration_pipeline.yml new file mode 100644 index 0000000..6dcd471 --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_feature_samples_table_migration_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_feature_samples_table_migration_pipeline: + name: Lakeflow Framework - Feature Samples - Table Migration Pipeline (${var.logical_env}) + channel: CURRENT + 
serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: feature_samples_table_migration + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_kafka_samples_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_kafka_samples_pipeline.yml new file mode 100644 index 0000000..a395b8b --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_kafka_samples_pipeline.yml @@ -0,0 +1,20 @@ +resources: + pipelines: + lakeflow_samples_bronze_kafka_samples_pipeline: + name: Lakeflow Framework - Bronze - Kafka Samples Pipeline (${var.logical_env}) + catalog: ${var.catalog} + schema: ${var.schema} + channel: PREVIEW + serverless: true + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + bundle.target: ${bundle.target} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: kafka_samples + root_path: ${workspace.file_path}/src/dataflows/kafka_samples diff --git a/samples/bronze_sample/resources/serverless/bronze_template_samples_pipeline.yml b/samples/bronze_sample/resources/serverless/bronze_template_samples_pipeline.yml new file mode 100644 index 0000000..bb9d119 --- /dev/null +++ b/samples/bronze_sample/resources/serverless/bronze_template_samples_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_bronze_template_samples_pipeline: + name: Lakeflow Framework - Bronze - Template Samples Pipeline 
(${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: template_samples + root_path: ${workspace.file_path}/src/dataflows/template_samples \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json new file mode 100644 index 0000000..19c1436 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json @@ -0,0 +1,28 @@ +{ + "dataFlowId": "base_customer_address", + "dataFlowGroup": "base_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer_address", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_address", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_address_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_file_source_main.json b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_file_source_main.json new file mode 100644 index 0000000..9f78a92 --- /dev/null +++ 
b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_file_source_main.json @@ -0,0 +1,41 @@ +{ + "dataFlowId": "base_customer_file_source", + "dataFlowGroup": "base_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "cloudFiles", + "sourceViewName": "v_customer_files", + "sourceDetails": { + "path": "{sample_file_location}/customer/", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "./customer_file_source_schema.json", + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "CAST(LOAD_TIMESTAMP AS TIMESTAMP) AS LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_file_sample", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_schema.json" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "where": "", + "ignore_null_updates": false, + "scd_type": "1" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_main.json b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_main.json new file mode 100644 index 0000000..34f25ba --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_main.json @@ -0,0 +1,26 @@ +{ + "dataFlowId": "base_customer", + "dataFlowGroup": "base_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer", + "comment": "Customer main table", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_schema.json", + "sparkConf": { + "spark.sql.session.timeZone": "UTC" + } + } +} 
\ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/expectations/customer_address_dqe.json b/samples/bronze_sample/src/dataflows/base_samples/expectations/customer_address_dqe.json new file mode 100644 index 0000000..7405996 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/expectations/customer_address_dqe.json @@ -0,0 +1,15 @@ +{ + "expect_or_drop": [ + { + "name": "PK not null", + "constraint": "CUSTOMER_ID IS NOT NULL", + "tag": "Validity" + }, + { + "name": "enabledTest", + "constraint": "CUSTOMER_ID = 1", + "tag": "Validity", + "enabled": false + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_address_schema.json b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_address_schema.json new file mode 100644 index 0000000..23597cf --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_file_source_schema.json b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_file_source_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_file_source_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, 
+ "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_schema.json b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/base_samples/schemas/customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_sql_flow_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_sql_flow_main.json new file mode 100644 index 0000000..fc997fe --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_sql_flow_main.json @@ -0,0 +1,30 @@ +{ + "dataFlowId": "append_sql_flow", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "append_sql_flow", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + 
"schemaPath": "target/customer_address_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "flag", + "flowGroups": [ + { + "flowGroupId": "main", + "flows": { + "f_customer_address_append_sql": { + "flowType": "append_sql", + "flowDetails": { + "targetTable": "append_sql_flow", + "sqlStatement": "SELECT * FROM STREAM({staging_schema}.customer_address)" + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_view_flow_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_view_flow_main.json new file mode 100644 index 0000000..a88c07b --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/append_view_flow_main.json @@ -0,0 +1,37 @@ +{ + "dataFlowId": "append_view_flow", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "append_view_flow", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "flowGroups": [ + { + "flowGroupId": "main", + "flows": { + "f_customer_append_view": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "append_view_flow", + "sourceView": "v_append_view_flow" + }, + "views": { + "v_append_view_flow": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/ddl_schema_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/ddl_schema_main.json new file mode 100644 index 0000000..ee74dde --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/ddl_schema_main.json @@ -0,0 +1,22 @@ +{ + "dataFlowId": "feature_constraints", + 
"dataFlowGroup": "feature_samples_general", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_constraints", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_constraints", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_ddl_schema.ddl" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_main.json new file mode 100644 index 0000000..e0709f8 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_main.json @@ -0,0 +1,34 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_datetime", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_files_datetime", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_schema.json", + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": "{sample_file_location}/snapshot_customer/customer_{version}.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "timestamp", + "datetimeFormat": "%Y_%m_%d" + }, + "track_history_except_column_list":[ + "LOAD_TIMESTAMP" + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_main.json 
b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_main.json new file mode 100644 index 0000000..976cbf5 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_main.json @@ -0,0 +1,35 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_datetime_recursive_and_partitioned", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_files_datetime_recursive_and_partitioned", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_schema.json", + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": "{sample_file_location}/snapshot_customer_partitioned/{version}/customer.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "timestamp", + "datetimeFormat": "YEAR=%Y/MONTH=%m/DAY=%d", + "recursiveFileLookup": true + }, + "track_history_except_column_list":[ + "LOAD_TIMESTAMP" + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_parquet_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_parquet_main.json new file mode 100644 index 0000000..d16c45e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_datetime_recursive_and_partitioned_parquet_main.json @@ -0,0 +1,32 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_datetime_recursive_and_partitioned_parquet", + "dataFlowGroup": 
"feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_files_datetime_recursive_and_partitioned_parquet", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_schema.json", + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "parquet", + "path": "{sample_file_location}/snapshot_customer_partitioned_parquet/{version}/customer.parquet", + "versionType": "timestamp", + "datetimeFormat": "YEAR=%Y/MONTH=%m/DAY=%d", + "recursiveFileLookup": true + }, + "track_history_except_column_list":[ + "LOAD_TIMESTAMP" + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_flow_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_flow_main.json new file mode 100644 index 0000000..cc9661e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_flow_main.json @@ -0,0 +1,80 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_flow", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_files_snapshot_flow", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "flowGroups": [ + { + "flowGroupId": "main", + "stagingTables": { + "stg_feature_historical_files_snapshot_flow": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "1", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": 
"{sample_file_location}/snapshot_customer/customer_{version}.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "timestamp", + "datetimeFormat": "%Y_%m_%d", + "schemaPath": "source/feature_historic_snapshot_customer_schema.json", + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "TO_TIMESTAMP(LOAD_TIMESTAMP, 'yyyy-MM-dd HH:mm:ss') AS LOAD_TIMESTAMP" + ] + } + }, + "configFlags": ["disableOperationalMetadata"] + } + }, + "flows": { + "f_feature_historical_files_snapshot_append_flow": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "feature_historical_files_snapshot_flow", + "sourceView": "v_feature_historical_files_snapshot_append" + }, + "views": { + "v_feature_historical_files_snapshot_append": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "stg_feature_historical_files_snapshot_flow", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "_change_type AS META_CDC_CHANGE_TYPE" + ], + "startingVersionFromDLTSetup": true, + "cdfChangeTypeOverride": ["insert", "update_postimage", "delete"] + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_int_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_int_main.json new file mode 100644 index 0000000..dea075a --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_int_main.json @@ -0,0 +1,33 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_int", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_files_int", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_schema.json", + 
"configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": "{sample_file_location}/customer/customer_{version}.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "integer" + }, + "track_history_except_column_list":[ + "LOAD_TIMESTAMP" + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_schema_and_select_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_schema_and_select_main.json new file mode 100644 index 0000000..acd9f24 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_files_schema_and_select_main.json @@ -0,0 +1,41 @@ +{ + "dataFlowId": "feature_historical_files_snapshot_schema_and_select", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_files_schema_and_select", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": "{sample_file_location}/snapshot_customer/customer_{version}.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "timestamp", + "datetimeFormat": "%Y_%m_%d", + "schemaPath": "source/feature_historic_snapshot_customer_schema.json", + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "LOAD_TIMESTAMP" + ] + }, + "track_history_except_column_list":[ + "LOAD_TIMESTAMP" + ] + } +} \ No newline at end of file diff --git 
a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_datetime_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_datetime_main.json new file mode 100644 index 0000000..7291879 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_datetime_main.json @@ -0,0 +1,28 @@ +{ + "dataFlowId": "feature_historical_table_snapshot_datetime", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_table_datetime", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_historic_snapshot_customer_schema.json", + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "table", + "source": { + "table": "{staging_schema}.customer_historical_snapshot_source", + "versionColumn": "LOAD_TIMESTAMP", + "versionType": "timestamp" + }, + "track_history_except_column_list": ["LOAD_TIMESTAMP"] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_select_expression_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_select_expression_main.json new file mode 100644 index 0000000..15a6133 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/historical_snapshot_table_select_expression_main.json @@ -0,0 +1,34 @@ +{ + "dataFlowId": "feature_historical_table_snapshot_select_expression", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_historical_snapshot_table_select_expression", + "tableProperties": { + 
"delta.enableChangeDataFeed": "true" + }, + "configFlags": ["disableOperationalMetadata"] + }, + "cdcSnapshotSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "snapshotType": "historical", + "sourceType": "table", + "source": { + "table": "{staging_schema}.customer_historical_snapshot_source", + "versionColumn": "LOAD_TIMESTAMP", + "versionType": "timestamp", + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "LOAD_TIMESTAMP" + ] + }, + "track_history_except_column_list": ["LOAD_TIMESTAMP"] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/materialized_views_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/materialized_views_main.json new file mode 100644 index 0000000..5d34b4f --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/materialized_views_main.json @@ -0,0 +1,46 @@ +{ + "dataFlowId": "feature_materialized_views", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "materialized_view", + "materializedViews": { + "feature_mv_source_view": { + "sourceView": { + "sourceViewName": "v_feature_mv_source_view", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + } + }, + "feature_mv_sql_path": { + "sqlPath": "./feature_mv_sql_path.sql" + }, + "feature_mv_sql_statement": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer", + "tableDetails": { + "private": true + } + }, + "feature_mv_with_quarantine": { + "sqlStatement": "SELECT * FROM {staging_schema}.customer_address", + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta", + "clusterByAuto": true + } + }, + "feature_mv_chain_mvs": { + "sqlStatement": "SELECT * FROM live.feature_mv_sql_statement", + "tableDetails": { 
+ "comment": "Test Config", + "sparkConf": { + "spark.sql.session.timeZone": "Australia/Sydney" + } + } + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd1_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd1_main.json new file mode 100644 index 0000000..169bdb8 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd1_main.json @@ -0,0 +1,36 @@ +{ + "dataFlowId": "feature_periodic_snapshot_scd1", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "sourceSystem": "test", + "sourceType": "delta", + "sourceViewName": "v_periodic_snapshot_customer_scd1", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_snapshot_source", + "cdfEnabled": false, + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "DELETE_FLAG" + ] + }, + "mode": "batch", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_periodic_snapshot_scd1", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_periodic_snapshot_customer_schema.json" + }, + "cdcSnapshotSettings": { + "snapshotType": "periodic", + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "1" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd2_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd2_main.json new file mode 100644 index 0000000..76cd704 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/periodic_snapshot_scd2_main.json @@ -0,0 +1,36 @@ +{ + "dataFlowId": "feature_periodic_snapshot_scd2", + "dataFlowGroup": "feature_samples_snapshots", + "dataFlowType": "standard", + "sourceSystem": "test", + "sourceType": "delta", + "sourceViewName": 
"v_periodic_snapshot_customer_scd2", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_snapshot_source", + "cdfEnabled": false, + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "DELETE_FLAG" + ] + }, + "mode": "batch", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_periodic_snapshot_scd2", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_periodic_snapshot_customer_schema.json" + }, + "cdcSnapshotSettings": { + "snapshotType": "periodic", + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_extension_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_extension_main.json new file mode 100644 index 0000000..d8c467d --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_extension_main.json @@ -0,0 +1,24 @@ +{ + "dataFlowId": "feature_python_extension_source", + "dataFlowGroup": "feature_samples_python", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "python", + "sourceViewName": "v_feature_python_extension_source", + "sourceDetails": { + "tokens": { + "sourceTable": "{staging_schema}.customer" + }, + "pythonModule": "sources.get_customer_cdf" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_extension_source", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_python_function_source_schema.json" + } +} + diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_main.json new file mode 100644 index 0000000..9743f32 --- /dev/null +++ 
b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_source_main.json @@ -0,0 +1,23 @@ +{ + "dataFlowId": "feature_python_function_source", + "dataFlowGroup": "feature_samples_python", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "python", + "sourceViewName": "v_feature_python_function_source", + "sourceDetails": { + "tokens": { + "sourceTable": "{staging_schema}.customer" + }, + "functionPath": "feature_python_function_source.py" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_function_source", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_python_function_source_schema.json" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_extension_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_extension_main.json new file mode 100644 index 0000000..378cf68 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_extension_main.json @@ -0,0 +1,26 @@ +{ + "dataFlowId": "feature_python_extension_transform", + "dataFlowGroup": "feature_samples_python", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_python_extension_transform", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "pythonTransform": { + "module": "transforms.customer_aggregation" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_extension_transform", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_python_function_transform_schema.json" + } +} + diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_main.json 
b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_main.json new file mode 100644 index 0000000..68ae582 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/python_transform_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "feature_python_function_transform_transform", + "dataFlowGroup": "feature_samples_python", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_python_function_transform", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "pythonTransform": { + "functionPath": "feature_python_function_transform.py" + } + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_python_function_transform", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_python_function_transform_schema.json" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_flag_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_flag_main.json new file mode 100644 index 0000000..3c51575 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_flag_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "feature_quarantine_flag", + "dataFlowGroup": "feature_samples_data_quality", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer_address_feature_quarantine_flag", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID" + , "CITY" + , "STATE" + , "LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_quarantine_flag", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + 
"schemaPath": "target/customer_address_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "flag" +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_main.json new file mode 100644 index 0000000..e476c0c --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_main.json @@ -0,0 +1,34 @@ +{ + "dataFlowId": "feature_quarantine_table", + "dataFlowGroup": "feature_samples_data_quality", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer_address_feature_quarantine_table", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID" + , "CITY" + , "STATE" + , "LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_quarantine_table", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_address_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_with_cdc_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_with_cdc_main.json new file mode 100644 index 0000000..610d36d --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/quarantine_table_with_cdc_main.json @@ -0,0 +1,41 @@ +{ + "dataFlowId": "feature_cdc_with_quarantine_table", + "dataFlowGroup": "feature_samples", + 
"dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer_address_feature_quarantine_table_cdc", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID" + , "CITY" + , "STATE" + , "LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_cdc_with_quarantine_table", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_address_schema.json" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "scd_type": "2" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "./customer_address_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_custom_python_generate_cdc_feed.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_custom_python_generate_cdc_feed.json new file mode 100644 index 0000000..635665e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_custom_python_generate_cdc_feed.json @@ -0,0 +1,21 @@ +{ + "dataFlowId": "sink_delta_uc_table", + "dataFlowGroup": "feature_samples_general_DISABLED", + "dataFlowType": "standard", + "sourceSystem": "test_system", + "sourceType": "delta", + "sourceViewName": "v_customer_delta_sink_uc_table", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta_sink", + "targetDetails": { + "name": "sink_delta_uc_table", + "sinkOptions": { + "tableName": "{bronze_schema}.feature_sink_delta_uc_table" + } + } +} \ No newline at end of file diff --git 
a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_path_table_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_path_table_main.json new file mode 100644 index 0000000..f98d844 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_path_table_main.json @@ -0,0 +1,21 @@ +{ + "dataFlowId": "sink_delta_path_table", + "dataFlowGroup": "feature_samples_general_DISABLED", + "dataFlowType": "standard", + "sourceSystem": "test_system", + "sourceType": "delta", + "sourceViewName": "v_customer_delta_sink_path_table", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta_sink", + "targetDetails": { + "name": "sink_delta_path_table", + "sinkOptions": { + "path": "{staging_volume}/feature_sink_delta_path_table" + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_uc_table_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_uc_table_main.json new file mode 100644 index 0000000..635665e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_delta_uc_table_main.json @@ -0,0 +1,21 @@ +{ + "dataFlowId": "sink_delta_uc_table", + "dataFlowGroup": "feature_samples_general_DISABLED", + "dataFlowType": "standard", + "sourceSystem": "test_system", + "sourceType": "delta", + "sourceViewName": "v_customer_delta_sink_uc_table", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta_sink", + "targetDetails": { + "name": "sink_delta_uc_table", + "sinkOptions": { + "tableName": "{bronze_schema}.feature_sink_delta_uc_table" + } + } +} \ No newline at end of file diff --git 
a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_basic_sql_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_basic_sql_main.json new file mode 100644 index 0000000..291b62e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_basic_sql_main.json @@ -0,0 +1,27 @@ +{ + "dataFlowId": "for_each_batch_basic_sql", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "standard", + "sourceSystem": "test_system", + "sourceType": "delta", + "sourceViewName": "v_customer_purchase_basic_sql", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_purchase", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "foreach_batch_sink", + "targetDetails": { + "name": "for_each_batch_basic_sql", + "type": "basic_sql", + "config": { + "database": "{staging_schema}", + "table": "feature_foreach_batch_single_basic_sql", + "sqlPath": "./customer_purchase_transformed.sql", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_python_function_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_python_function_main.json new file mode 100644 index 0000000..ae20340 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/sink_foreach_batch_single_python_function_main.json @@ -0,0 +1,27 @@ +{ + "dataFlowId": "for_each_batch_python_function", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "standard", + "sourceSystem": "test_system", + "sourceType": "delta", + "sourceViewName": "v_customer_purchase_python_function", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer_purchase", + "cdfEnabled": true + }, + "mode": "stream", + 
"targetFormat": "foreach_batch_sink", + "targetDetails": { + "name": "for_each_batch_python_function", + "type": "python_function", + "config": { + "functionPath": "./feature_foreach_batch_python.py", + "tokens": { + "staging_schema": "{staging_schema}", + "bronze_schema": "{bronze_schema}", + "staging_volume": "{staging_volume}" + } + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_append_only_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_append_only_main.json new file mode 100644 index 0000000..7d93ad0 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_append_only_main.json @@ -0,0 +1,37 @@ +{ + "dataFlowId": "table_migration_append_only", + "dataFlowGroup": "feature_samples_table_migration", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_tm_customer_scd0", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_migrated_table_append_only", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "tableMigrationDetails": { + "catalogType": "uc", + "enabled": true, + "autoStartingVersionsEnabled": true, + "sourceDetails": { + "database": "{bronze_schema}", + "table": "table_to_migrate_scd0" + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_scd2_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_scd2_main.json new file mode 100644 index 0000000..ca37421 --- /dev/null +++ 
b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/table_migration_scd2_main.json @@ -0,0 +1,55 @@ +{ + "dataFlowId": "table_migration_scd2", + "dataFlowGroup": "feature_samples_table_migration", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_feature_tm_customer_scd2", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "LOAD_TIMESTAMP" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_migrated_table_scd2", + "tableProperties": { + "delta.enableChangeDataFeed": "false" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": [], + "ignore_null_updates": true, + "apply_as_deletes": "" + }, + "tableMigrationDetails": { + "catalogType": "uc", + "enabled": true, + "autoStartingVersionsEnabled": true, + "sourceDetails": { + "database": "{bronze_schema}", + "table": "table_to_migrate_scd2", + "selectExp": [ + "CUSTOMER_ID", + "FIRST_NAME", + "LAST_NAME", + "EMAIL", + "EFFECTIVE_FROM AS __START_AT", + "EFFECTIVE_TO AS __END_AT" + ] + } + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_flows_dataflowspec_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_flows_dataflowspec_main.json new file mode 100644 index 0000000..ad8a054 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_flows_dataflowspec_main.json @@ -0,0 +1,82 @@ +{ + "dataFlowId": "version_mapping_flows", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "flow", + "dataFlowVersion": "0.1.0", + "targetFormat": "delta", + "targetDetails": { + "table": "feature_version_mapping_flows", + "tableProperties": { 
+ "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/feature_version_mapping_flows_schema.json" + }, + "cdcApplyChanges": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "where": "", + "ignore_null_updates": false, + "scd_type": "1" + }, + "flowGroups": [ + { + "flowGroupId": "version_mapping_flows_1", + "stagingTables": { + "feature_version_mapping_stage": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcApplyChanges": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "ignore_null_updates": true, + "scd_type": "1" + } + } + }, + "flows": { + "f_feature_version_mapping_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "feature_version_mapping_stage", + "sourceView": "v_flows_version_mapping" + }, + "views": { + "v_flows_version_mapping": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + }, + "f_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "feature_version_mapping_flows", + "sourceView": "v_version_mapping_final" + }, + "views": { + "v_version_mapping_final": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "feature_version_mapping_stage", + "cdfEnabled": true + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_standard_dataflowspec_main.json b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_standard_dataflowspec_main.json new file mode 100644 index 0000000..3905003 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dataflowspec/version_mapping_standard_dataflowspec_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "version_mapping", + "dataFlowGroup": "feature_samples_general", + "dataFlowType": "standard", + 
"dataFlowVersion": "0.1.0", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_version_mapping_standard", + "sourceDetails": { + "database": "{staging_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "v_version_mapping_standard", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcApplyChanges": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "where": "", + "ignore_null_updates": false, + "scd_type": "1" + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dml/customer_purchase_transformed.sql b/samples/bronze_sample/src/dataflows/feature_samples/dml/customer_purchase_transformed.sql new file mode 100644 index 0000000..9a4e59b --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dml/customer_purchase_transformed.sql @@ -0,0 +1,9 @@ +SELECT * +FROM ( + SELECT customer_id, product, quantity + FROM micro_batch_view +) src +PIVOT ( + SUM(quantity) + FOR product IN ('apples', 'bananas', 'oranges', 'pears') +) \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/dml/feature_mv_sql_path.sql b/samples/bronze_sample/src/dataflows/feature_samples/dml/feature_mv_sql_path.sql new file mode 100644 index 0000000..98bc595 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/dml/feature_mv_sql_path.sql @@ -0,0 +1 @@ +SELECT * FROM {staging_schema}.customer \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json b/samples/bronze_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json new file mode 100644 index 0000000..7405996 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json @@ -0,0 +1,15 @@ +{ + "expect_or_drop": [ + { + "name": "PK not 
null", + "constraint": "CUSTOMER_ID IS NOT NULL", + "tag": "Validity" + }, + { + "name": "enabledTest", + "constraint": "CUSTOMER_ID = 1", + "tag": "Validity", + "enabled": false + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_foreach_batch_python.py b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_foreach_batch_python.py new file mode 100644 index 0000000..9fd61b9 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_foreach_batch_python.py @@ -0,0 +1,46 @@ +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as F +from typing import Dict + +def micro_batch_function(df: DataFrame, batch_id: int, tokens: Dict) -> DataFrame: + bronze_schema = tokens["bronze_schema"] + staging_schema = tokens["staging_schema"] + staging_volume = tokens["staging_volume"] + volume_root_file_path = f"/Volumes/{staging_schema}/{staging_volume}".replace(".", "/") + spark = df.sparkSession + + df_transformed = ( + df.groupBy("customer_id") + .pivot("product", ["apples", "bananas", "oranges", "pears"]) + .agg(F.sum("quantity")) + ) + + # Support multiple writes without multiple reads + df_transformed.persist() + + # New UC Delta Table example + table_name = f"{bronze_schema}.feature_foreach_batch_python" + + write_command = df_transformed.write.format("delta") \ + .mode("append") + # You can add partitionBy and clusterBy here + + try: + spark.sql(f"DESCRIBE TABLE {table_name}") + write_command.saveAsTable(table_name) + except Exception: + # Create table if it does not exist + write_command.saveAsTable(table_name) + spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = 'true')") + + # External Delta Table + df_transformed.write \ + .format("delta") \ + .mode("append") \ + .save(f"{volume_root_file_path}/feature_foreach_batch_python/delta_target") + + # JSON file location + 
df_transformed.write \ + .format("json") \ + .mode("append") \ + .save(f"{volume_root_file_path}/feature_foreach_batch_python/json_target") diff --git a/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_source.py b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_source.py new file mode 100644 index 0000000..12bd91d --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_source.py @@ -0,0 +1,15 @@ +from pyspark.sql import DataFrame, SparkSession +from pyspark.sql import functions as F +from typing import Dict + +def get_df(spark: SparkSession, tokens: Dict) -> DataFrame: + """ + Get a DataFrame from the source details with applied transformations. + """ + source_table = tokens["sourceTable"] + reader_options = { + "readChangeFeed": "true" + } + + df = spark.readStream.options(**reader_options).table(source_table) + return df.withColumn("TEST_COLUMN", F.lit("testing...")) diff --git a/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_transform.py b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_transform.py new file mode 100644 index 0000000..a4b15fd --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/python_functions/feature_python_function_transform.py @@ -0,0 +1,12 @@ +from pyspark.sql import DataFrame +from pyspark.sql import functions as F + +def apply_transform(df: DataFrame) -> DataFrame: + """ + Apply a transformation to the DataFrame. 
+ """ + return ( + df.withWatermark("load_timestamp", "10 minutes") + .groupBy("CUSTOMER_ID") + .agg(F.count("*").alias("COUNT")) + ) diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/source/feature_historic_snapshot_customer_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/source/feature_historic_snapshot_customer_schema.json new file mode 100644 index 0000000..c9e47c0 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/source/feature_historic_snapshot_customer_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "string", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_address_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_address_schema.json new file mode 100644 index 0000000..23597cf --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git 
a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_schema.json new file mode 100644 index 0000000..cac697c --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "string", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_ddl_schema.ddl b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_ddl_schema.ddl new file mode 100644 index 0000000..d525792 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_ddl_schema.ddl @@ -0,0 +1,8 @@ +CUSTOMER_ID integer NOT NULL, +FIRST_NAME string, +LAST_NAME string, +EMAIL string, +DELETE_FLAG boolean, +LOAD_TIMESTAMP timestamp, +YEAR_TEST INT GENERATED ALWAYS AS (YEAR(LOAD_TIMESTAMP)) +-- CONSTRAINT pk_customer PRIMARY KEY(CUSTOMER_ID) \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_historic_snapshot_customer_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_historic_snapshot_customer_schema.json new file mode 100644 index 0000000..8688cad --- /dev/null +++ 
b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_historic_snapshot_customer_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_periodic_snapshot_customer_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_periodic_snapshot_customer_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_periodic_snapshot_customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_source_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_source_schema.json new file mode 100644 index 0000000..89c4923 --- 
/dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_source_schema.json @@ -0,0 +1,47 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "TEST_COLUMN", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_transform_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_transform_schema.json new file mode 100644 index 0000000..f4a74c3 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_python_function_transform_schema.json @@ -0,0 +1,17 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "COUNT", + "type": "long", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_version_mapping_flows_schema.json b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_version_mapping_flows_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/feature_samples/schemas/target/feature_version_mapping_flows_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + 
"name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/kafka_samples/dataflowspec/kafka_sink_main.json b/samples/bronze_sample/src/dataflows/kafka_samples/dataflowspec/kafka_sink_main.json new file mode 100644 index 0000000..fecd081 --- /dev/null +++ b/samples/bronze_sample/src/dataflows/kafka_samples/dataflowspec/kafka_sink_main.json @@ -0,0 +1,51 @@ +{ + "dataFlowId": "kafka_sink", + "dataFlowGroup": "kafka_samples", + "dataFlowType": "flow", + "targetFormat": "kafka_sink", + "targetDetails": { + "name": "my_kafka_sink", + "sinkOptions": { + "topic": "{kafka_sink_topic}", + "kafka.bootstrap.servers": "{kafka_servers}", + "kafka.security.protocol": "SSL", + "kafka.ssl.keystore.secretRetrieval": { + "scope": "{kafka_keystore_password_secret_scope_name}", + "key": "{kafka_keystore_password_access_key_name}" + } + } + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "kafka_sink", + "flows": { + "f_kafka_sink_sample_source": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "my_kafka_sink", + "sourceView": "v_final_transform" + }, + "views": { + "v_kafka_sink_sample_source": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{staging_schema}", + "table": "kafka_sink_sample_source", + "cdfEnabled": true + } + }, + "v_final_transform": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "./tfm_final.sql" 
-- Build the Kafka sink record (key / value) from the staged messages.
--   key   : Message_Id rendered as a string.
--   value : JSON object made of the fields in Message_payload plus a
--           'timestamp' entry holding Message_Ts as a string.
SELECT
    cast(Message_Id as string) as key,
    to_json(
        map_concat(
            -- from_json needs a complete schema: a map type must name its key and
            -- value types, and they must match the map built below (string -> string)
            -- so that map_concat can merge the two.
            from_json(Message_payload, 'map<string,string>'), -- Parse JSON string into a map
            map('timestamp', cast(Message_Ts as string)) -- Add 'timestamp' as a string
        )
    ) AS value
FROM stream(v_kafka_sink_sample_source)
b/samples/bronze_sample/src/dataflows/kafka_samples/schemas/topic_sample_1_schema.json @@ -0,0 +1,19 @@ +{ + "key_schema": { + "type": "string" + }, + "value_schema": { + "type": "record", + "name": "User", + "fields": [ + { + "name": "action", + "type": "string" + }, + { + "name": "time", + "type": "long" + } + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/kafka_samples/schemas/topic_sample_1_schema_history.json b/samples/bronze_sample/src/dataflows/kafka_samples/schemas/topic_sample_1_schema_history.json new file mode 100644 index 0000000..a13ab9e --- /dev/null +++ b/samples/bronze_sample/src/dataflows/kafka_samples/schemas/topic_sample_1_schema_history.json @@ -0,0 +1,47 @@ +{ + "type": "struct", + "fields": [ + { + "name": "key", + "type": "binary", + "nullable": true, + "metadata": {} + }, + { + "name": "value", + "type": "binary", + "nullable": true, + "metadata": {} + }, + { + "name": "topic", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "partition", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "offset", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "timestamp", + "type": "timestamp", + "nullable": true, + "metadata": {} + }, + { + "name": "timestampType", + "type": "integer", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/template_samples/dataflowspec/template_samples_main.json b/samples/bronze_sample/src/dataflows/template_samples/dataflowspec/template_samples_main.json new file mode 100644 index 0000000..56eed0a --- /dev/null +++ b/samples/bronze_sample/src/dataflows/template_samples/dataflowspec/template_samples_main.json @@ -0,0 +1,22 @@ +{ + "template": "cdc_stream_from_snapshot_template", + "parameterSets": [ + { + "dataFlowId": "template_customer", + "sourceTable": "customer", + "targetTable": "customer_template", + "cdcKeys": ["CUSTOMER_ID"], + 
"sequenceByColumn": "LOAD_TIMESTAMP", + "schemaPath": "customer_schema.json" + }, + { + "dataFlowId": "template_customer_address", + "sourceTable": "customer_address", + "targetTable": "customer_address_template", + "cdcKeys": ["CUSTOMER_ID"], + "sequenceByColumn": "LOAD_TIMESTAMP", + "schemaPath": "customer_address_schema.json" + } + ] +} + diff --git a/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_address_schema.json b/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_address_schema.json new file mode 100644 index 0000000..23597cf --- /dev/null +++ b/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_schema.json b/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_schema.json new file mode 100644 index 0000000..8688cad --- /dev/null +++ b/samples/bronze_sample/src/dataflows/template_samples/schemas/customer_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of 
def get_customer_cdf(spark: SparkSession, tokens: Dict) -> DataFrame:
    """
    Stream a table through its Change Data Feed and tag every row.

    Args:
        spark: SparkSession used to open the streaming read.
        tokens: Tokens from the dataflow spec. ``tokens["sourceTable"]`` names
            the table to read; a missing key raises ``KeyError``.

    Returns:
        Streaming DataFrame read with ``readChangeFeed`` enabled, with an
        extra literal ``TEST_COLUMN`` column appended.
    """
    # Resolve the table name first so a missing token fails before any
    # reader is configured.
    table_name = tokens["sourceTable"]

    change_feed = (
        spark.readStream
        .options(readChangeFeed="true")
        .table(table_name)
    )

    marker = F.lit("testing from extension...")
    return change_feed.withColumn("TEST_COLUMN", marker)
def customer_aggregation_with_tokens(df: DataFrame, tokens: Dict) -> DataFrame:
    """
    Count rows per group after applying a configurable watermark.

    Args:
        df: Input DataFrame with customer data.
        tokens: Configuration tokens. The keys are camelCase and are read
            exactly as spelled here:
            - watermarkColumn: Column to use for the watermark
              (default: "load_timestamp")
            - watermarkDelay: Watermark delay duration
              (default: "10 minutes")
            - groupByColumn: Column to group by (default: "CUSTOMER_ID")

    Returns:
        DataFrame with the group-by column and a COUNT column holding the
        number of rows in each group.
    """
    # Any key that is absent falls back to the defaults used by
    # customer_aggregation-style transforms in this sample.
    watermark_column = tokens.get("watermarkColumn", "load_timestamp")
    watermark_delay = tokens.get("watermarkDelay", "10 minutes")
    group_by_column = tokens.get("groupByColumn", "CUSTOMER_ID")

    return (
        df.withWatermark(watermark_column, watermark_delay)
        .groupBy(group_by_column)
        .agg(F.count("*").alias("COUNT"))
    )
"main.lakeflow_samples_bronze{logical_env}", + "silver_schema": "main.lakeflow_samples_silver{logical_env}", + "staging_volume": "stg_volume", + "sample_file_location": "/Volumes/main/lakeflow_samples_staging{logical_env}/stg_volume", + "kafka_sink_servers": "your_kafka_sink_server_urls", + "kafka_sink_topic": "your_topic_with_{logical_env}", + "kafka_source_servers": "your_kafka_source_server_urls", + "kafka_source_topic": "your_kafka_source_topic" + } +} diff --git a/samples/bronze_sample/src/pipeline_configs/global.json b/samples/bronze_sample/src/pipeline_configs/global.json new file mode 100644 index 0000000..8f5bb4b --- /dev/null +++ b/samples/bronze_sample/src/pipeline_configs/global.json @@ -0,0 +1,3 @@ +{ + "table_migration_state_volume_path": "/Volumes/main/lakeflow_samples_staging_es/stg_volume/checkpoint_state" +} \ No newline at end of file diff --git a/samples/bronze_sample/src/templates/cdc_stream_from_snapshot_template.json b/samples/bronze_sample/src/templates/cdc_stream_from_snapshot_template.json new file mode 100644 index 0000000..3fb2a87 --- /dev/null +++ b/samples/bronze_sample/src/templates/cdc_stream_from_snapshot_template.json @@ -0,0 +1,112 @@ +{ + "name": "cdc_stream_from_snapshot_template", + "parameters": { + "dataFlowId": { + "type": "string", + "required": true + }, + "cdcKeys": { + "type": "list", + "required": true + }, + "sourceTable": { + "type": "string", + "required": true + }, + "targetTable": { + "type": "string", + "required": true + }, + "sequenceByColumn": { + "type": "string", + "required": true + }, + "schemaPath": { + "type": "string", + "required": true + } + }, + "template": { + "dataFlowId": "${param.dataFlowId}", + "dataFlowGroup": "template_samples", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "${param.targetTable}", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": "${param.cdcKeys}", + "scd_type": "2", + "sequence_by": 
"${param.sequenceByColumn}", + "except_column_list": ["${param.sequenceByColumn}", "is_delete"], + "ignore_null_updates": false, + "apply_as_deletes": "is_delete = 1" + }, + "flowGroups": [ + { + "flowGroupId": "main", + "stagingTables": { + "stg_${param.dataFlowId}_${param.sourceTable}": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSnapshotSettings": { + "keys": "${param.cdcKeys}", + "scd_type": "1", + "snapshotType": "historical", + "sourceType": "file", + "source": { + "format": "csv", + "path": "{sample_file_location}/template_samples/snapshot_${param.sourceTable}/${param.sourceTable}_{version}.csv", + "readerOptions": { + "header": "true" + }, + "versionType": "timestamp", + "datetimeFormat": "%Y_%m_%d", + "schemaPath": "${param.schemaPath}", + "selectExp": [ + "* EXCEPT(${param.sequenceByColumn})", + "TO_TIMESTAMP(${param.sequenceByColumn}, 'yyyy-MM-dd HH:mm:ss') AS ${param.sequenceByColumn}", + "_metadata AS meta_file_metadata" + ] + } + }, + "configFlags": ["disableOperationalMetadata"] + } + }, + "flows": { + "f_${param.dataFlowId}_${param.sourceTable}_merge_flow": { + "flowType": "merge", + "flowDetails": { + "targetTable": "${param.targetTable}", + "sourceView": "v_stg_${param.dataFlowId}_${param.sourceTable}" + }, + "views": { + "v_stg_${param.dataFlowId}_${param.sourceTable}": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "stg_${param.dataFlowId}_${param.sourceTable}", + "cdfEnabled": true, + "selectExp": [ + "*" + ], + "startingVersionFromDLTSetup": true, + "cdfChangeTypeOverride": ["insert", "update_postimage", "delete"], + "pythonTransform": { + "functionPath": "explode_deletes_function_transform.py" + } + } + } + } + } + } + } + ] + } +} \ No newline at end of file diff --git a/samples/bronze_sample/src/templates/python_functions/explode_deletes_function_transform.py 
def apply_transform(df: DataFrame) -> DataFrame:
    """
    Emit two rows for every delete event and stagger their timestamps.

    The incoming ``_change_type`` column is renamed to ``meta_cdc_operation``.
    Every row then receives an integer ``is_delete`` column: rows whose
    operation is ``"delete"`` are exploded into two copies (``is_delete`` 0
    and 1); all other rows keep a single copy with ``is_delete`` 0.

    For delete rows only, ``LOAD_TIMESTAMP`` is shifted by +1 ms on the
    ``is_delete = 0`` copy and by +2 ms on the ``is_delete = 1`` copy.
    Non-delete rows keep their original timestamp.

    Args:
        df: Input DataFrame containing ``_change_type`` and ``LOAD_TIMESTAMP``.

    Returns:
        DataFrame with the renamed operation column, the new ``is_delete``
        column and the adjusted ``LOAD_TIMESTAMP``.
    """
    # NOTE: both column names are hard-coded in this function.
    seq_col = "LOAD_TIMESTAMP"
    op_col = "meta_cdc_operation"

    # Column expressions are resolved lazily, so referring to op_col here is
    # fine even though the rename is only applied further down.
    from_delete = F.col(op_col) == "delete"
    single_copy = F.array(F.lit(0))
    double_copy = F.array(F.lit(0), F.lit(1))
    copy_flags = F.when(from_delete, double_copy).otherwise(single_copy)

    renamed = df.withColumnRenamed("_change_type", op_col)
    exploded = renamed.withColumn("is_delete", F.explode(copy_flags))

    flag = F.col("is_delete")
    current_ts = F.col(seq_col)
    adjusted_ts = (
        F.when(from_delete & (flag == 0),
               current_ts + F.expr("INTERVAL 1 millisecond"))
        .when(from_delete & (flag == 1),
              current_ts + F.expr("INTERVAL 2 millisecond"))
        .otherwise(current_ts)
    )

    return exploded.withColumn(seq_col, adjusted_ts)
# Parse the command-line flags shared by the sample deployment scripts.
#
# Every flag takes exactly one value and stores it in a global variable:
#   -u | --user            -> user
#   -h | --host            -> host
#   -c | --compute         -> compute
#   -p | --profile         -> profile
#   -l | --logical_env     -> logical_env
#        --catalog         -> catalog
#        --schema          -> schema
#        --schema_namespace -> schema_namespace
#
# Exits with status 1 on an unknown flag, or when a flag is the last argument
# and therefore has no value (previously this was silently treated as an
# empty value). An explicitly empty value such as `-l ""` is still accepted.
parse_common_args() {
    while [[ "$#" -gt 0 ]]; do
        # First pass: reject unknown flags and flags without a value.
        case "$1" in
            -u|--user|-h|--host|-c|--compute|-p|--profile|-l|--logical_env|--catalog|--schema|--schema_namespace)
                if [[ "$#" -lt 2 ]]; then
                    echo "Missing value for parameter: $1" >&2
                    exit 1
                fi
                ;;
            *)
                echo "Unknown parameter: $1"
                exit 1
                ;;
        esac

        # Second pass: store the value in the matching global.
        case "$1" in
            -u|--user)          user="$2" ;;
            -h|--host)          host="$2" ;;
            -c|--compute)       compute="$2" ;;
            -p|--profile)       profile="$2" ;;
            -l|--logical_env)   logical_env="$2" ;;
            --catalog)          catalog="$2" ;;
            --schema)           schema="$2" ;;
            --schema_namespace) schema_namespace="$2" ;;
        esac

        # Consume the flag and its value together.
        shift 2
    done
}
# Export the BUNDLE_VAR_* environment variables for one bundle deployment and
# print them under a banner.
#
# Arguments:
#   $1  bundle name shown in the banner
#   $2  schema; BUNDLE_VAR_schema is exported only when this is non-empty
#
# Reads the globals logical_env, catalog, schema_namespace, user, host and the
# FRAMEWORK_NAME / FRAMEWORK_TARGET constants. Also exports MSYS_NO_PATHCONV=1.
setup_bundle_env() {
    local bundle_name="$1"
    local schema="$2"
    local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

    # Git Bash (MSYS2) would otherwise rewrite the /Workspace/... path below.
    export MSYS_NO_PATHCONV=1

    # Banner
    echo ""
    echo "$rule"
    echo "$bundle_name Deployment"
    echo "$rule"
    echo "Bundle Environment:"

    # Always-present variables
    export BUNDLE_VAR_logical_env="$logical_env"
    echo " - BUNDLE_VAR_logical_env: $BUNDLE_VAR_logical_env"

    export BUNDLE_VAR_catalog="$catalog"
    echo " - BUNDLE_VAR_catalog: $BUNDLE_VAR_catalog"

    # Optional variables: exported and printed only when a value is available
    if [[ -n "$schema" ]]; then
        export BUNDLE_VAR_schema="$schema"
        echo " - BUNDLE_VAR_schema: $BUNDLE_VAR_schema"
    fi

    if [[ -n "$schema_namespace" ]]; then
        export BUNDLE_VAR_schema_namespace="$schema_namespace"
        echo " - BUNDLE_VAR_schema_namespace: $BUNDLE_VAR_schema_namespace"
    fi

    # Workspace location of the deployed framework sources for this user
    export BUNDLE_VAR_framework_source_path="/Workspace/Users/$user/.bundle/$FRAMEWORK_NAME/$FRAMEWORK_TARGET/current/files/src"
    echo " - BUNDLE_VAR_framework_source_path: $BUNDLE_VAR_framework_source_path"

    export BUNDLE_VAR_workspace_host="$host"
    echo " - BUNDLE_VAR_workspace_host: $BUNDLE_VAR_workspace_host"

    echo ""
}
# Run one sed expression against a file in place, portably.
# `sed -i ''` is BSD/macOS-only: GNU sed (Linux, Git Bash) treats the empty
# string as the script and the expression as a file name. Attaching a suffix
# to -i works with both implementations; the temporary copy is removed again.
_sed_in_place() {
    local expression="$1"
    local target="$2"

    sed -i.sedtmp "$expression" "$target" && rm -f "${target}.sedtmp"
}

# Rewrite the schema tokens in a substitutions file for a non-default catalog
# and/or schema namespace.
#
# Arguments:
#   $1  path to the substitutions file (.yaml/.yml is treated as YAML,
#       anything else as JSON)
#
# Reads the globals catalog, schema_namespace, logical_env, DEFAULT_CATALOG
# and DEFAULT_SCHEMA_NAMESPACE.
#
# Behaviour:
#   - Returns 0 without touching the file when catalog and schema namespace
#     both equal their defaults.
#   - Keeps the pristine original in "<file>.backup"; if a backup already
#     exists, the file is first restored from it.
#   - Updates staging/bronze/silver/gold/dpm "<layer>_schema" and
#     "sample_file_location" (keys that are absent are simply left alone).
#   - On success exports SUBSTITUTIONS_FILE_MODIFIED=true and prints the file.
#
# Returns 1 when the file does not exist or an in-place edit fails; in the
# latter case the file is restored from the backup.
update_substitutions_file() {
    local substitutions_file="$1"

    # Only update if using non-default values
    if [[ "$catalog" == "$DEFAULT_CATALOG" && "$schema_namespace" == "$DEFAULT_SCHEMA_NAMESPACE" ]]; then
        return 0
    fi

    log_info "Updating substitutions file: $substitutions_file"
    log_info "Using catalog: $catalog (default: $DEFAULT_CATALOG), schema namespace: $schema_namespace (default: $DEFAULT_SCHEMA_NAMESPACE)"

    if [[ ! -f "$substitutions_file" ]]; then
        log_error "Substitutions file not found: $substitutions_file"
        return 1
    fi

    # The .backup file is the master original and must be preserved.
    if [[ -f "${substitutions_file}.backup" ]]; then
        # Left over from a previous (possibly failed) run: start from the
        # original again so edits never stack on top of earlier edits.
        log_warning "Existing backup found from previous run, restoring original before proceeding"
        cp "${substitutions_file}.backup" "$substitutions_file"
        log_info "Restored original from existing backup: ${substitutions_file}.backup"
    else
        cp "$substitutions_file" "${substitutions_file}.backup"
        log_info "Created backup: ${substitutions_file}.backup"
    fi

    # Detect file format (YAML vs JSON) to pick the matching sed patterns.
    local is_yaml=false
    if [[ "$substitutions_file" == *.yaml || "$substitutions_file" == *.yml ]]; then
        is_yaml=true
        log_info "Detected YAML format"
    else
        log_info "Detected JSON format"
    fi

    # Build one expression per token; the patterns are the same ones for
    # every layer, so they are generated instead of being spelled out.
    local expressions=()
    local layer schema_value
    for layer in staging bronze silver gold dpm; do
        schema_value="$catalog.${schema_namespace}_${layer}${logical_env}"
        if [[ "$is_yaml" == "true" ]]; then
            expressions+=("s|${layer}_schema:.*|${layer}_schema: ${schema_value}|")
        else
            expressions+=("s|\"${layer}_schema\": \"[^\"]*\"|\"${layer}_schema\": \"${schema_value}\"|")
        fi
    done

    local volume_path="/Volumes/$catalog/${schema_namespace}_staging${logical_env}/stg_volume"
    if [[ "$is_yaml" == "true" ]]; then
        expressions+=("s|sample_file_location:.*|sample_file_location: ${volume_path}|")
    else
        expressions+=("s|\"sample_file_location\": \"[^\"]*\"|\"sample_file_location\": \"${volume_path}\"|")
    fi

    local expression
    for expression in "${expressions[@]}"; do
        if ! _sed_in_place "$expression" "$substitutions_file"; then
            log_error "Failed to update substitutions file: $substitutions_file"
            # Do not leave a half-edited file behind.
            cp "${substitutions_file}.backup" "$substitutions_file"
            rm -f "${substitutions_file}.sedtmp"
            return 1
        fi
    done

    log_success "Successfully updated substitutions file"
    # Flag read by restore_substitutions_file to know a restore is needed.
    export SUBSTITUTIONS_FILE_MODIFIED=true

    # Display the updated content
    log_info "Updated substitutions file content:"
    cat "$substitutions_file"
    echo ""
}
#!/bin/bash

##########
# Main Lakeflow Framework Deployment Script
#
# Collects the common deployment parameters (flags and/or interactive
# prompts), prints a summary and then runs every sample deployment script in
# turn: bronze, silver, gold, yaml, and finally test data + orchestrator.
#
# All sub-deployments are attempted even if an earlier one fails; the script
# exits with status 1 if any of them failed.
##########

# Source common library and configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/common.sh"

# Parse command-line arguments
parse_common_args "$@"

# Prompt for missing parameters
prompt_common_params

# Validate all required parameters
if ! validate_required_params; then
    exit 1
fi

# Display deployment summary
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Lakeflow Framework Deployment"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Deployment Configuration:"
echo " - User: $user"
echo " - Host: $host"
echo " - Profile: $profile"
# Label matches the "0=Classic, 1=Serverless" wording of the compute prompt.
echo " - Compute: $([ "$compute" == "0" ] && echo "Classic" || echo "Serverless")"
echo " - Catalog: $catalog"
echo " - Schema Namespace: $schema_namespace"
echo " - Logical Environment: $logical_env"
echo ""

# The sub-scripts are addressed relative to this script, so make sure the
# working directory is the script directory regardless of where we were
# started from.
cd "$SCRIPT_DIR" || {
    log_error "Failed to change directory to $SCRIPT_DIR"
    exit 1
}

# Flags forwarded unchanged to every sub-deployment
common_flags=(
    -u "$user"
    -h "$host"
    -p "$profile"
    -c "$compute"
    -l "$logical_env"
    --catalog "$catalog"
    --schema_namespace "$schema_namespace"
)

# Bronze, Silver, Gold and YAML samples, then test data and orchestrator
deploy_steps=(
    deploy_bronze.sh
    deploy_silver.sh
    deploy_gold.sh
    deploy_yaml.sh
    deploy_orchestrator.sh
)

failed_steps=()
for step in "${deploy_steps[@]}"; do
    if ! "./$step" "${common_flags[@]}"; then
        log_error "$step failed"
        failed_steps+=("$step")
    fi
done

# Surface any failure to the caller instead of reporting only the last step.
if [[ ${#failed_steps[@]} -gt 0 ]]; then
    log_error "Deployment steps failed: ${failed_steps[*]}"
    exit 1
fi
+# Stop here when the deployment fails so the run jobs below never execute
+# against a missing or partially deployed bundle.
+if ! ./deploy.sh -u "$user" -h "$host" -p "$profile" -c "$compute" -l "$logical_env" --catalog "$catalog" --schema_namespace "$schema_namespace"; then
+    log_error "Deployment failed. Exiting."
+    exit 1
+fi
+
+log_success "Deployment completed successfully"
+echo ""
+
+# Step 2: Execute run jobs sequentially
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo "Executing Run Jobs"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo ""
+
+# Change to the test_data_and_orchestrator directory
+cd "$SCRIPT_DIR/test_data_and_orchestrator" || {
+    log_error "Failed to change directory to test_data_and_orchestrator"
+    exit 1
+}
+
+# Job keys for each run
+declare -a job_keys=(
+    "lakeflow_samples_day_1_load_and_schema_initialization"
+    "lakeflow_samples_day_2_load"
+    "lakeflow_samples_day_3_load"
+    "lakeflow_samples_day_4_load"
+)
+
+# Execute each run job
+for ((i=1; i<=num_runs; i++)); do
+    job_key="${job_keys[$((i-1))]}"
+
+    echo ""
+    log_info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+    log_info "Executing Run $i: $job_key"
+    log_info "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+    echo ""
+
+    # Run the job and wait for completion
+    if databricks bundle run "$job_key" --profile "$profile" --var logical_env="$logical_env" 2>&1; then
+        log_success "Run $i completed successfully"
+    else
+        log_error "Run $i failed"
+        exit 1
+    fi
+
+    echo ""
+done
+
+# Return to the script directory
+cd "$SCRIPT_DIR" || exit 1
+
+echo ""
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+log_success "All operations completed successfully!"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
diff --git a/samples/deploy_bronze.sh b/samples/deploy_bronze.sh
new file mode 100755
index 0000000..b84cf49
--- /dev/null
+++ b/samples/deploy_bronze.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Bronze Sample Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="Bronze Sample Bundle"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Default schema. Assigned only after common.sh is sourced so DEFAULT_SCHEMA_NAMESPACE
+# (presumably defined there - TODO confirm) is expanded with its configured value.
+SCHEMA="${DEFAULT_SCHEMA_NAMESPACE}_bronze"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema - use command line if provided, otherwise use constant with logical environment
+if [[ -n "$schema_namespace" ]]; then
+    # Use schema from command line
+    schema="${schema_namespace}_bronze$logical_env"
+else
+    # Use schema constant with logical environment
+    schema="$SCHEMA$logical_env"
+fi
+
+# Set up bundle environment
+setup_bundle_env "$BUNDLE_NAME" "$schema"
+
+# Update substitutions file with catalog and schema namespace
+if ! update_substitutions_file "bronze_sample/src/pipeline_configs/dev_substitutions.json"; then
+    log_error "Failed to update substitutions file. Exiting."
+    exit 1
+fi
+
+# Change to bronze_sample directory for deployment
+cd bronze_sample
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
+
+# Restore original substitutions file
+restore_substitutions_file "bronze_sample/src/pipeline_configs/dev_substitutions.json"
\ No newline at end of file
diff --git a/samples/deploy_gold.sh b/samples/deploy_gold.sh
new file mode 100755
index 0000000..7a7433c
--- /dev/null
+++ b/samples/deploy_gold.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Gold Sample Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="Gold Sample Bundle"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Default schema. Assigned only after common.sh is sourced so DEFAULT_SCHEMA_NAMESPACE
+# (presumably defined there - TODO confirm) is expanded with its configured value.
+SCHEMA="${DEFAULT_SCHEMA_NAMESPACE}_gold"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema - use command line if provided, otherwise use constant with logical environment
+if [[ -n "$schema_namespace" ]]; then
+    # Use schema from command line
+    schema="${schema_namespace}_gold$logical_env"
+else
+    # Use schema constant with logical environment
+    schema="$SCHEMA$logical_env"
+fi
+
+# Set up bundle environment
+setup_bundle_env "$BUNDLE_NAME" "$schema"
+
+# Update substitutions file with catalog and schema namespace
+if ! update_substitutions_file "gold_sample/src/pipeline_configs/dev_substitutions.json"; then
+    log_error "Failed to update substitutions file. Exiting."
+    exit 1
+fi
+
+# Change to gold_sample directory for deployment
+cd gold_sample
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
+
+# Restore original substitutions file
+restore_substitutions_file "gold_sample/src/pipeline_configs/dev_substitutions.json"
\ No newline at end of file
diff --git a/samples/deploy_orchestrator.sh b/samples/deploy_orchestrator.sh
new file mode 100755
index 0000000..43b0ce6
--- /dev/null
+++ b/samples/deploy_orchestrator.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Test Data and Orchestrator Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="Test Data and Orchestrator Bundle"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema namespace - keep the value already provided (e.g. on the command
+# line), otherwise fall back to the default schema namespace
+schema_namespace="${schema_namespace:-$DEFAULT_SCHEMA_NAMESPACE}"
+
+# Set up bundle environment with all schemas
+setup_bundle_env "$BUNDLE_NAME"
+
+# Change to test_data_and_orchestrator directory for deployment
+cd test_data_and_orchestrator
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
\ No newline at end of file
diff --git a/samples/deploy_silver.sh b/samples/deploy_silver.sh
new file mode 100755
index 0000000..403f902
--- /dev/null
+++ b/samples/deploy_silver.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Silver Sample Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="Silver Sample Bundle"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Default schema. Assigned only after common.sh is sourced so DEFAULT_SCHEMA_NAMESPACE
+# (presumably defined there - TODO confirm) is expanded with its configured value.
+SCHEMA="${DEFAULT_SCHEMA_NAMESPACE}_silver"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema - use command line if provided, otherwise use constant with logical environment
+if [[ -n "$schema_namespace" ]]; then
+    # Use schema from command line
+    schema="${schema_namespace}_silver$logical_env"
+else
+    # Use schema constant with logical environment
+    schema="$SCHEMA$logical_env"
+fi
+
+# Set up bundle environment
+setup_bundle_env "$BUNDLE_NAME" "$schema"
+
+# Update substitutions file with catalog and schema namespace
+if ! update_substitutions_file "silver_sample/src/pipeline_configs/dev_substitutions.json"; then
+    log_error "Failed to update substitutions file. Exiting."
+    exit 1
+fi
+
+# Change to silver_sample directory for deployment
+cd silver_sample
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
+
+# Restore original substitutions file
+restore_substitutions_file "silver_sample/src/pipeline_configs/dev_substitutions.json"
\ No newline at end of file
diff --git a/samples/deploy_tpch.sh b/samples/deploy_tpch.sh
new file mode 100755
index 0000000..4b9ca40
--- /dev/null
+++ b/samples/deploy_tpch.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# TPCH Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="TPCH Bundle"
+DEFAULT_TPCH_SCHEMA_NAMESPACE="lakeflow_samples_tpch"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema namespace - keep the value already provided (e.g. on the command
+# line), otherwise fall back to the TPCH default schema namespace
+schema_namespace="${schema_namespace:-$DEFAULT_TPCH_SCHEMA_NAMESPACE}"
+
+# Set up bundle environment with all schemas
+setup_bundle_env "$BUNDLE_NAME"
+
+# Update substitutions file with catalog and schema namespace
+if ! update_substitutions_file "tpch_sample/src/pipeline_configs/dev_substitutions.json"; then
+    log_error "Failed to update substitutions file. Exiting."
+    exit 1
+fi
+
+# Change to tpch_sample directory for deployment
+cd tpch_sample
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
+
+# Restore original substitutions file
+restore_substitutions_file "tpch_sample/src/pipeline_configs/dev_substitutions.json"
diff --git a/samples/deploy_yaml.sh b/samples/deploy_yaml.sh
new file mode 100755
index 0000000..f49e68c
--- /dev/null
+++ b/samples/deploy_yaml.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# YAML Sample Bundle Deployment
+
+# Sample-specific constants
+BUNDLE_NAME="YAML Sample Bundle"
+
+# Source common library and configuration
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$SCRIPT_DIR/common.sh"
+
+# Default schema. Assigned only after common.sh is sourced so DEFAULT_SCHEMA_NAMESPACE
+# (presumably defined there - TODO confirm) is expanded with its configured value.
+SCHEMA="${DEFAULT_SCHEMA_NAMESPACE}_yaml"
+
+# Parse command-line arguments
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters
+if ! validate_required_params; then
+    exit 1
+fi
+
+# Set schema - use command line if provided, otherwise use constant with logical environment
+if [[ -n "$schema_namespace" ]]; then
+    # Use schema from command line
+    schema="${schema_namespace}_yaml$logical_env"
+else
+    # Use schema constant with logical environment
+    schema="$SCHEMA$logical_env"
+fi
+
+# Set up bundle environment
+setup_bundle_env "$BUNDLE_NAME" "$schema"
+
+# Update substitutions file with catalog and schema namespace
+if ! update_substitutions_file "yaml_sample/src/pipeline_configs/dev_substitutions.yaml"; then
+    log_error "Failed to update substitutions file. Exiting."
+    exit 1
+fi
+
+# Change to yaml_sample directory for deployment
+cd yaml_sample
+
+# Deploy the bundle
+deploy_bundle "$BUNDLE_NAME"
+
+# Return to parent directory
+cd ..
+
+# Restore original substitutions file
+restore_substitutions_file "yaml_sample/src/pipeline_configs/dev_substitutions.yaml"
\ No newline at end of file
diff --git a/samples/destroy.sh b/samples/destroy.sh
new file mode 100755
index 0000000..1be5180
--- /dev/null
+++ b/samples/destroy.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Destroy Samples Bundle
+
+# Source common functions and constants
+source "$(dirname "$0")/common.sh"
+
+# Parse command-line arguments using common function
+parse_common_args "$@"
+
+# Prompt for missing parameters
+prompt_common_params
+
+# Validate all required parameters before anything is destroyed
+if ! validate_required_params; then
+    exit 1
+fi
+
+##########
+# Set up Bundle Vars
+setup_bundle_env "Destroy Samples" ""
+
+# Set to 1 when any bundle fails to destroy; used as the script's exit status.
+destroy_failed=0
+
+# Destroy the bundle in directory $1 (announced as $2). When a layer name is
+# passed as $3, BUNDLE_VAR_schema is pointed at that layer's schema first.
+destroy_sample_bundle() {
+    local bundle_dir="$1" bundle_label="$2" layer="$3"
+
+    echo "Destroying $bundle_label"
+    if [[ -n "$layer" ]]; then
+        export BUNDLE_VAR_schema="${catalog}.${schema_namespace}_${layer}${logical_env}"
+        echo "BUNDLE_VAR_schema: $BUNDLE_VAR_schema"
+    fi
+
+    # Subshell plus "&&": a failed cd can neither run the auto-approved destroy
+    # from the wrong directory nor shift the caller's working directory.
+    if ! (cd "$bundle_dir" && databricks bundle destroy -t dev --profile "$profile" --auto-approve); then
+        log_error "Failed to destroy $bundle_label"
+        destroy_failed=1
+    fi
+    echo ""
+}
+
+##########
+# Destroy the orchestrator first, then every sample bundle that deploy.sh deploys
+destroy_sample_bundle test_data_and_orchestrator "Test Data and Orchestrator Bundle" ""
+destroy_sample_bundle bronze_sample "Bronze Sample Bundle" bronze
+destroy_sample_bundle silver_sample "Silver Sample Bundle" silver
+destroy_sample_bundle gold_sample "Gold Sample Bundle" gold
+destroy_sample_bundle yaml_sample "YAML Sample Bundle" yaml
+
+exit "$destroy_failed"
diff --git a/samples/gold_sample/.gitignore b/samples/gold_sample/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/samples/gold_sample/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/samples/gold_sample/.vscode/settings.json b/samples/gold_sample/.vscode/settings.json new file mode 100644 index 0000000..1a79a81 --- /dev/null +++ b/samples/gold_sample/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" +} \ No newline at end of file diff --git a/samples/gold_sample/databricks.yml b/samples/gold_sample/databricks.yml new file mode 100644 index 0000000..6bbfdb7 --- /dev/null +++ b/samples/gold_sample/databricks.yml @@ -0,0 +1,37 @@ +# This is a Databricks asset bundle definition for gold_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: gold_sample + +include: + - scratch/resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + schema: + description: The target UC schema + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: gold + logical_env: + description: The logical environment + default: "" + pipeline_cluster_config: + description: Basic cluster config, add node types as necessary + default: + label: default + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + +targets: + # The 'dev' target, for development purposes. This target is the default.
+ dev: + mode: development + default: true \ No newline at end of file diff --git a/samples/gold_sample/fixtures/.gitkeep b/samples/gold_sample/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/gold_sample/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. + +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/gold_sample/resources/classic/gold_stream_static_pipeline.yml b/samples/gold_sample/resources/classic/gold_stream_static_pipeline.yml new file mode 100644 index 0000000..941c6bf --- /dev/null +++ b/samples/gold_sample/resources/classic/gold_stream_static_pipeline.yml @@ -0,0 +1,23 @@ +resources: + pipelines: + lakeflow_samples_gold_stream_static_pipeline: + name: Lakeflow Framework - Gold - Stream Static Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: stream_static_samples + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples + \ No 
newline at end of file diff --git a/samples/gold_sample/resources/serverless/gold_stream_static_pipeline.yml b/samples/gold_sample/resources/serverless/gold_stream_static_pipeline.yml new file mode 100644 index 0000000..d735ecc --- /dev/null +++ b/samples/gold_sample/resources/serverless/gold_stream_static_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_gold_stream_static_pipeline: + name: Lakeflow Framework - Gold - Stream Static Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: stream_static_samples + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples diff --git a/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_json_main.json b/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_json_main.json new file mode 100644 index 0000000..cf2bfaa --- /dev/null +++ b/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_json_main.json @@ -0,0 +1,156 @@ +{ + "dataFlowId": "dim_customer_stream_static_json", + "dataFlowGroup": "stream_static_samples", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_customer_delta_join_sample", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": true, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { 
+ "flowGroupId": "dim_customer_stream_static_json_1", + "stagingTables": { + "stg_customer_json_appnd_keys": { + "type": "ST" + }, + "stg_customer_json_merge_keys": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "__START_AT", + "where": "", + "ignore_null_updates": true, + "except_column_list": ["__START_AT"], + "scd_type": "2", + "track_history_column_list": [], + "track_history_except_column_list": [] + } + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_json_appnd_keys", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "__START_AT"] + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_json_appnd_keys", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "__START_AT"] + } + } + } + }, + "f_merge_keys": { + "flowType": "merge", + "flowDetails": { + "targetTable": "stg_customer_json_merge_keys", + "sourceView": "stg_customer_json_appnd_keys" + } + }, + "f_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "dim_customer_delta_join_sample", + "sourceView": "v_transform" + }, + "views": { + "v_transform": { + "mode": "stream", + "sourceType": "deltaJoin", + "sourceDetails": { + "sources": [ + { + "database": "live", + "table": "stg_customer_json_merge_keys", + "cdfEnabled": true, + "alias": "drv", + "joinMode": "stream" + }, + { + "database": "{silver_schema}", + "table": "customer", + "cdfEnabled": false, + 
"alias": "c", + "joinMode": "static" + }, + { + "database": "{silver_schema}", + "table": "customer_address", + "cdfEnabled": false, + "alias": "ca", + "joinMode": "static" + } + ], + "joins": [ + { + "joinType": "inner", + "condition": "drv.CUSTOMER_ID = c.CUSTOMER_ID and drv.__START_AT = c.__START_AT" + }, + { + "joinType": "inner", + "condition": "drv.CUSTOMER_ID = ca.CUSTOMER_ID and drv.__START_AT = ca.__START_AT" + } + ], + "selectExp": [ + "drv.CUSTOMER_ID", + "drv.__START_AT", + "c.FIRST_NAME", + "c.LAST_NAME", + "c.EMAIL", + "ca.CITY", + "ca.STATE", + "c.DELETE_FLAG" + ] + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_sql_main.json b/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_sql_main.json new file mode 100644 index 0000000..c5751e9 --- /dev/null +++ b/samples/gold_sample/src/dataflows/stream_static_samples/dataflowspec/dim_customer_sql_main.json @@ -0,0 +1,119 @@ +{ + "dataFlowId": "dim_customer_stream_static_sql", + "dataFlowGroup": "stream_static_samples", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_customer_sql_sample", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": true, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dim_customer_stream_static_sql_1", + "stagingTables": { + "stg_customer_sql_appnd_keys": { + "type": "ST" + }, + "stg_customer_sql_dedupe_keys": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"] + } + } + }, + 
"flows": { + "f_customer_1": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_sql_appnd_keys", + "sourceView": "v_customer_1" + }, + "views": { + "v_customer_1": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "__START_AT"] + } + } + } + }, + "f_customer_address_1": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_sql_appnd_keys", + "sourceView": "v_customer_address_1" + }, + "views": { + "v_customer_address_1": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "__START_AT"] + } + } + } + }, + "f_merge_keys_1": { + "flowType": "merge", + "flowDetails": { + "targetTable": "stg_customer_sql_dedupe_keys", + "sourceView": "stg_customer_sql_appnd_keys" + } + }, + "f_target_1": { + "flowType": "merge", + "flowDetails": { + "targetTable": "dim_customer_sql_sample", + "sourceView": "v_transform_1" + }, + "views": { + "v_final_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "stg_customer_sql_dedupe_keys", + "cdfEnabled": true + } + }, + "v_transform_1": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "dim_customer.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/gold_sample/src/dataflows/stream_static_samples/dml/dim_customer.sql b/samples/gold_sample/src/dataflows/stream_static_samples/dml/dim_customer.sql new file mode 100644 index 0000000..382c7ed --- /dev/null +++ b/samples/gold_sample/src/dataflows/stream_static_samples/dml/dim_customer.sql @@ -0,0 +1,13 @@ +SELECT + drv.CUSTOMER_ID, + c.FIRST_NAME, + c.LAST_NAME, + c.EMAIL, + ca.CITY, + ca.STATE, + c.DELETE_FLAG, + drv.__START_AT +FROM + STREAM(live.v_final_cdf_feed) 
AS drv + JOIN {bronze_schema}.customer AS c ON drv.CUSTOMER_ID = c.CUSTOMER_ID + JOIN {bronze_schema}.customer_address AS ca ON drv.CUSTOMER_ID = ca.CUSTOMER_ID \ No newline at end of file diff --git a/samples/gold_sample/src/pipeline_configs/dev_substitutions.json b/samples/gold_sample/src/pipeline_configs/dev_substitutions.json new file mode 100644 index 0000000..00476bd --- /dev/null +++ b/samples/gold_sample/src/pipeline_configs/dev_substitutions.json @@ -0,0 +1,6 @@ +{ + "tokens": { + "bronze_schema": "main.lakeflow_samples_bronze{logical_env}", + "silver_schema": "main.lakeflow_samples_silver{logical_env}" + } +} \ No newline at end of file diff --git a/samples/silver_sample/.gitignore b/samples/silver_sample/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/samples/silver_sample/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/samples/silver_sample/.vscode/settings.json b/samples/silver_sample/.vscode/settings.json new file mode 100644 index 0000000..1a79a81 --- /dev/null +++ b/samples/silver_sample/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" +} \ No newline at end of file diff --git a/samples/silver_sample/databricks.yml b/samples/silver_sample/databricks.yml new file mode 100644 index 0000000..580aa6b --- /dev/null +++ b/samples/silver_sample/databricks.yml @@ -0,0 +1,37 @@ +# This is a Databricks asset bundle definition for silver_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 
+bundle: + name: silver_sample + +include: + - scratch/resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + schema: + description: The target UC schema + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: silver + logical_env: + description: The logical environment + default: "" + pipeline_cluster_config: + description: Basic cluster config, add node types as necessary + default: + label: default + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + +targets: + # The 'dev' target, for development purposes. This target is the default. + dev: + mode: development + default: true \ No newline at end of file diff --git a/samples/silver_sample/fixtures/.gitkeep b/samples/silver_sample/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/silver_sample/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files.
+ +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/silver_sample/resources/classic/silver_base_samples_pipeline.yml b/samples/silver_sample/resources/classic/silver_base_samples_pipeline.yml new file mode 100644 index 0000000..7320594 --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_base_samples_pipeline.yml @@ -0,0 +1,23 @@ +resources: + pipelines: + lakeflow_samples_silver_base_samples_pipeline: + name: Lakeflow Framework - Silver - Base Samples Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: base_samples + root_path: ${workspace.file_path}/src/dataflows/base_samples + \ No newline at end of file diff --git a/samples/silver_sample/resources/classic/silver_multi_source_streaming_basic_pipeline.yml b/samples/silver_sample/resources/classic/silver_multi_source_streaming_basic_pipeline.yml new file mode 100644 index 0000000..adeba30 --- /dev/null +++ 
b/samples/silver_sample/resources/classic/silver_multi_source_streaming_basic_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_multi_source_streaming_basic_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Basic Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_basic + root_path: ${workspace.file_path}/src/dataflows/multi_source_streaming_samples diff --git a/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_final_pipeline.yml b/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_final_pipeline.yml new file mode 100644 index 0000000..3da5987 --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_final_pipeline.yml @@ -0,0 +1,23 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_ms_decomp_final_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Final Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_decomp_final + root_path: 
${workspace.file_path}/src/dataflows/multi_source_streaming_samples + \ No newline at end of file diff --git a/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_staging_pipeline.yml b/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_staging_pipeline.yml new file mode 100644 index 0000000..805d768 --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_multi_source_streaming_decomp_staging_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Staging Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_decomp_staging + root_path: ${workspace.file_path}/src/dataflows/multi_source_streaming_samples diff --git a/samples/silver_sample/resources/classic/silver_sample_customer_dpm_pipeline.yml b/samples/silver_sample/resources/classic/silver_sample_customer_dpm_pipeline.yml new file mode 100644 index 0000000..4576b64 --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_sample_customer_dpm_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_dpm_pipeline: + name: Lakeflow Framework - Silver - Customer DPM Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + 
configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.dataFlowGroupFilter: dpm + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/silver_sample/resources/classic/silver_sample_customer_snapshot_flow_pipeline.yml b/samples/silver_sample/resources/classic/silver_sample_customer_snapshot_flow_pipeline.yml new file mode 100644 index 0000000..e8abf0b --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_sample_customer_snapshot_flow_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_snapshot_flow_pipeline: + name: Lakeflow Framework - Silver - Customer Snapshot Flow Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.dataFlowGroupFilter: csnap + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/silver_sample/resources/classic/silver_stream_static_basic_pipeline.yml b/samples/silver_sample/resources/classic/silver_stream_static_basic_pipeline.yml new file mode 100644 index 0000000..8f4d5aa --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_stream_static_basic_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_stream_static_basic_pipeline: + name: Lakeflow Framework - Silver - Stream Static - Basic Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - 
${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: stream_static_basic + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples diff --git a/samples/silver_sample/resources/classic/silver_stream_static_streaming_dwh_pipeline.yml b/samples/silver_sample/resources/classic/silver_stream_static_streaming_dwh_pipeline.yml new file mode 100644 index 0000000..9b81868 --- /dev/null +++ b/samples/silver_sample/resources/classic/silver_stream_static_streaming_dwh_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_stream_static_streaming_dwh_pipeline: + name: Lakeflow Framework - Silver - Stream Static - Streaming DWH Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: stream_static_streaming_dwh + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples diff --git a/samples/silver_sample/resources/serverless/silver_base_samples_pipeline.yml b/samples/silver_sample/resources/serverless/silver_base_samples_pipeline.yml new file mode 100644 index 0000000..17387e0 --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_base_samples_pipeline.yml @@ -0,0 +1,21 @@ +resources: 
+ pipelines: + lakeflow_samples_silver_base_samples_pipeline: + name: Lakeflow Framework - Silver - Base Samples Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: base_samples + root_path: ${workspace.file_path}/src/dataflows/base_samples diff --git a/samples/silver_sample/resources/serverless/silver_multi_source_streaming_basic_pipeline.yml b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_basic_pipeline.yml new file mode 100644 index 0000000..f5c67f4 --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_basic_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_multi_source_streaming_basic_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Basic Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_basic + root_path: ${workspace.file_path}/src/dataflows/multi_source_streaming_samples diff --git a/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_final_pipeline.yml 
b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_final_pipeline.yml new file mode 100644 index 0000000..bb96ffc --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_final_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_ms_decomp_final_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Final Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_decomp_final + root_path: ${workspace.file_path}/src/dataflows/multi_source_streaming_samples + \ No newline at end of file diff --git a/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_staging_pipeline.yml b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_staging_pipeline.yml new file mode 100644 index 0000000..07d2b1a --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_multi_source_streaming_decomp_staging_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline: + name: Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Staging Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: 
${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: multi_source_streaming_decomp_staging + root_path: ${workspace.file_path}/src/dataflows/multi_source_streaming_samples diff --git a/samples/silver_sample/resources/serverless/silver_sample_customer_dpm_pipeline.yml b/samples/silver_sample/resources/serverless/silver_sample_customer_dpm_pipeline.yml new file mode 100644 index 0000000..def1ae1 --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_sample_customer_dpm_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_dpm_pipeline: + name: Lakeflow Framework - Silver - Customer DPM Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.dataFlowGroupFilter: dpm + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/silver_sample/resources/serverless/silver_sample_customer_snapshot_flow_pipeline.yml b/samples/silver_sample/resources/serverless/silver_sample_customer_snapshot_flow_pipeline.yml new file mode 100644 index 0000000..7db140b --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_sample_customer_snapshot_flow_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_customer_snapshot_flow_pipeline: + name: Lakeflow Framework - Silver - Customer Snapshot Flow Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: 
${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.dataFlowGroupFilter: csnap + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + root_path: ${workspace.file_path}/src/dataflows/feature_samples diff --git a/samples/silver_sample/resources/serverless/silver_stream_static_basic_pipeline.yml b/samples/silver_sample/resources/serverless/silver_stream_static_basic_pipeline.yml new file mode 100644 index 0000000..40273d6 --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_stream_static_basic_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_stream_static_basic_pipeline: + name: Lakeflow Framework - Silver - Stream Static - Basic Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: stream_static_basic + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples diff --git a/samples/silver_sample/resources/serverless/silver_stream_static_streaming_dwh_pipeline.yml b/samples/silver_sample/resources/serverless/silver_stream_static_streaming_dwh_pipeline.yml new file mode 100644 index 0000000..f9077eb --- /dev/null +++ b/samples/silver_sample/resources/serverless/silver_stream_static_streaming_dwh_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_silver_stream_static_streaming_dwh_pipeline: + name: Lakeflow Framework - Silver - Stream Static - Streaming DWH Pipeline 
(${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowIdFilter: stream_static_streaming_dwh + root_path: ${workspace.file_path}/src/dataflows/stream_static_samples diff --git a/samples/silver_sample/resources/silver_sample_customer_migration_pipeline.yml b/samples/silver_sample/resources/silver_sample_customer_migration_pipeline.yml new file mode 100644 index 0000000..fe97bc6 --- /dev/null +++ b/samples/silver_sample/resources/silver_sample_customer_migration_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + dlt_framework_samples_silver_customer_migration_pipeline: + name: Lakeflow Framework Samples - Silver - Customer Migration Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + target: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.dataFlowGroupFilter: migration + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + root_path: ${workspace.file_path}/src/dataflows/feature_samples + \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json b/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json new file mode 100644 index 0000000..6daadc2 --- /dev/null +++ b/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_address_main.json @@ -0,0 
+1,32 @@ +{ + "dataFlowId": "base_customer_address", + "dataFlowGroup": "base_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer_address", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_address_schema.json", + "clusterByAuto": true + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_main.json b/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_main.json new file mode 100644 index 0000000..2b527a8 --- /dev/null +++ b/samples/silver_sample/src/dataflows/base_samples/dataflowspec/customer_main.json @@ -0,0 +1,32 @@ +{ + "dataFlowId": "base_customer", + "dataFlowGroup": "base_samples", + "dataFlowType": "standard", + "sourceSystem": "testSystem", + "sourceType": "delta", + "sourceViewName": "v_customer", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_schema.json" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": false, + "apply_as_deletes": "DELETE_FLAG = 1" + } +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/base_samples/schemas/customer_address_schema.json 
b/samples/silver_sample/src/dataflows/base_samples/schemas/customer_address_schema.json new file mode 100644 index 0000000..23597cf --- /dev/null +++ b/samples/silver_sample/src/dataflows/base_samples/schemas/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/base_samples/schemas/customer_schema.json b/samples/silver_sample/src/dataflows/base_samples/schemas/customer_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/silver_sample/src/dataflows/base_samples/schemas/customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_dpm_main.json b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_dpm_main.json new file mode 100644 index 0000000..c4d6d13 --- /dev/null +++ b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_dpm_main.json @@ -0,0 +1,82 @@ +{ + 
"dataFlowId": "dpm", + "dataFlowGroup": "dpm", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_dpm", + "database": "{dpm_schema}", + "schemaPath": "customer.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": true, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dpm", + "stagingTables": { + "customer_dpm_appnd": { + "type": "ST", + "database": "{dpm_schema}" + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "{dpm_schema}.customer_dpm_appnd", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "{dpm_schema}.customer_dpm_appnd", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true, + "whereClause": ["STATE is not NULL"] + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "{dpm_schema}.customer_dpm", + "sourceView": "{dpm_schema}.customer_dpm_appnd" + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_migration_main.json b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_migration_main.json new file mode 100644 index 0000000..ba65714 --- /dev/null +++ 
b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_migration_main.json @@ -0,0 +1,88 @@ +{ + "dataFlowId": "customer_migration_scd2", + "dataFlowGroup": "migration", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_migration", + "schemaPath": "customer.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": true, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "tableMigrationDetails": { + "enabled": true, + "catalogType": "uc", + "sourceDetails": { + "database": "{dpm_schema}", + "table": "customer_dpm" + } + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dpm", + "stagingTables": { + "customer_migration_appnd": { + "type": "ST" + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_migration_appnd", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_migration_appnd", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true, + "whereClause": ["STATE is not NULL"] + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_migration", + "sourceView": "customer_migration_appnd" + } + } + } + } + ] +} \ No newline at end of file diff --git 
a/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_flow_main.json b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_flow_main.json new file mode 100644 index 0000000..2aca580 --- /dev/null +++ b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_flow_main.json @@ -0,0 +1,46 @@ +{ + "dataFlowId": "csnap", + "dataFlowGroup": "csnap", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_snapshot_flow", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSnapshotSettings": { + "snapshotType": "periodic", + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "c_snapshot_flow", + "flows": { + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_snapshot_flow", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "batch", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": false, + "selectExp": ["*"] + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_source_main.json b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_source_main.json new file mode 100644 index 0000000..76f66fd --- /dev/null +++ b/samples/silver_sample/src/dataflows/feature_samples/dataflowspec/customer_snapshot_source_main.json @@ -0,0 +1,63 @@ +{ + "dataFlowId": "csnap_scd1_source", + "dataFlowGroup": "csnap", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_snapshot_scd1_source", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "UPDATE_TIMESTAMP", + 
"except_column_list": ["SNAPSHOT_VERSION", "SNAPSHOT_TIMESTAMP", "change_type"], + "apply_as_deletes": "change_type = 'delete'", + "ignore_null_updates": false + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "csnap_scd2_flow", + "stagingTables": { + "customer_snapshot_source_append": { + "type": "ST" + } + }, + "flows": { + "f_customer_snapshot_source_append": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_snapshot_source_append", + "sourceView": "v_customer_snapshot_source" + }, + "views": { + "v_customer_snapshot_source": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "feature_historical_snapshot_datetime_scd1", + "cdfEnabled": true, + "startingVersionFromDLTSetup": true, + "selectExp": ["*", "_change_type as change_type"], + "cdfChangeTypeOverride": ["insert", "update_postimage", "delete"] + } + } + } + }, + "f_customer_snapshot_source_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_snapshot_scd1_source", + "sourceView": "customer_snapshot_source_append" + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/dml/tfm_customer.sql b/samples/silver_sample/src/dataflows/feature_samples/dml/tfm_customer.sql new file mode 100644 index 0000000..b76c2e4 --- /dev/null +++ b/samples/silver_sample/src/dataflows/feature_samples/dml/tfm_customer.sql @@ -0,0 +1,10 @@ +SELECT + CUSTOMER_ID + , FIRST_NAME + , LAST_NAME + , EMAIL + , CITY + , STATE + , DELETE_FLAG + , __START_AT +FROM stream(live.v_final_cdf_feed) \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json b/samples/silver_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json new file mode 100644 index 0000000..7405996 --- /dev/null +++ 
b/samples/silver_sample/src/dataflows/feature_samples/expectations/customer_address_dqe.json @@ -0,0 +1,15 @@ +{ + "expect_or_drop": [ + { + "name": "PK not null", + "constraint": "CUSTOMER_ID IS NOT NULL", + "tag": "Validity" + }, + { + "name": "enabledTest", + "constraint": "CUSTOMER_ID = 1", + "tag": "Validity", + "enabled": false + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/feature_samples/schemas/customer.json b/samples/silver_sample/src/dataflows/feature_samples/schemas/customer.json new file mode 100644 index 0000000..ab0f3c6 --- /dev/null +++ b/samples/silver_sample/src/dataflows/feature_samples/schemas/customer.json @@ -0,0 +1,47 @@ +{ + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_basic_main.json b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_basic_main.json new file mode 100644 index 0000000..f439d65 --- /dev/null +++ b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_basic_main.json @@ -0,0 +1,120 @@ +{ + "dataFlowId": "multi_source_streaming_basic", + "dataFlowGroup": "multi_source_streaming", + "dataFlowType": "flow", + "targetFormat": "delta", + 
"targetDetails": { + "table": "customer_ms_basic", + "schemaPath": "customer.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "clusterByAuto": true, + "clusterByColumns": ["CUSTOMER_ID"] + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "SEQUENCE_BY_COLUMN", + "except_column_list": ["SEQUENCE_BY_COLUMN"], + "ignore_null_updates": false, + "apply_as_deletes": "DELETE_FLAG = true" + }, + "flowGroups": [ + { + "flowGroupId": "multi_source_streaming_basic_1", + "stagingTables": { + "customer_ms_basic_appnd": { + "type": "ST" + }, + "customer_ms_basic_merge": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "sequence_by": "LOAD_TIMESTAMP", + "ignore_null_updates": true, + "scd_type": "2" + }, + "clusterByAuto": true + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ms_basic_appnd", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ms_basic_appnd", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true + } + } + } + }, + "f_merge": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_ms_basic_merge", + "sourceView": "customer_ms_basic_appnd" + } + }, + "f_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_ms_basic", + "sourceView": "v_final_transform" + }, + "views": { + "v_final_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + 
"database": "live", + "table": "customer_ms_basic_merge", + "cdfEnabled": true + } + }, + "v_final_transform": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "./tfm_customer.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_final_main.json b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_final_main.json new file mode 100644 index 0000000..1947055 --- /dev/null +++ b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_final_main.json @@ -0,0 +1,56 @@ +{ + "dataFlowId": "multi_source_streaming_decomp_final", + "dataFlowGroup": "multi_source_streaming", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_ms_decomp", + "schemaPath": "customer.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "SEQUENCE_BY_COLUMN", + "except_column_list": ["SEQUENCE_BY_COLUMN"], + "ignore_null_updates": true, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "multi_source_streaming_decomp_final_1", + "flows": { + "f_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_ms_decomp", + "sourceView": "v_final_transform" + }, + "views": { + "v_final_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer_ms_decomp_merge", + "cdfEnabled": true + } + }, + "v_final_transform": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "./tfm_customer.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git 
a/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_staging_main.json b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_staging_main.json new file mode 100644 index 0000000..fdb04c7 --- /dev/null +++ b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dataflowspec/customer_multi_streaming_decomposed_staging_main.json @@ -0,0 +1,79 @@ +{ + "dataFlowId": "multi_source_streaming_decomp_staging", + "dataFlowGroup": "multi_source_streaming", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_ms_decomp_merge", + "schemaPath": "customer.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": true + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "multi_source_streaming_decomp_staging_1", + "stagingTables": { + "customer_ms_decomp_appnd": { + "type": "ST" + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ms_decomp_appnd", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ms_decomp_appnd", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true, + "whereClause": ["STATE is not NULL"] + } + } + } + }, + "f_merge": { + "flowType": "merge", + 
"flowDetails": { + "targetTable": "customer_ms_decomp_merge", + "sourceView": "customer_ms_decomp_appnd" + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dml/tfm_customer.sql b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dml/tfm_customer.sql new file mode 100644 index 0000000..405af84 --- /dev/null +++ b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/dml/tfm_customer.sql @@ -0,0 +1,10 @@ +SELECT + CUSTOMER_ID + , FIRST_NAME + , LAST_NAME + , EMAIL + , CITY + , STATE + , DELETE_FLAG + , __START_AT AS SEQUENCE_BY_COLUMN +FROM stream(live.v_final_cdf_feed) \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/multi_source_streaming_samples/schemas/customer.json b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/schemas/customer.json new file mode 100644 index 0000000..ab0f3c6 --- /dev/null +++ b/samples/silver_sample/src/dataflows/multi_source_streaming_samples/schemas/customer.json @@ -0,0 +1,47 @@ +{ + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + } + ], + "type": "struct" +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_basic_main.json b/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_basic_main.json 
new file mode 100644 index 0000000..2059124 --- /dev/null +++ b/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_basic_main.json @@ -0,0 +1,77 @@ +{ + "dataFlowId": "stream_static_basic", + "dataFlowGroup": "stream_static_samples", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_ss_basic", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "except_column_list": ["LOAD_TIMESTAMP"], + "ignore_null_updates": false, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "stream_static_basic_1", + "flows": { + "f_customer": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_ss_basic", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "deltaJoin", + "sourceDetails": { + "sources": [ + { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true, + "alias": "c", + "joinMode": "stream" + }, + { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": false, + "alias": "ca", + "joinMode": "static" + } + ], + "joins": [ + { + "joinType": "left", + "condition": "c.CUSTOMER_ID = ca.CUSTOMER_ID" + } + ], + "selectExp": [ + "c.CUSTOMER_ID", + "c.FIRST_NAME", + "c.LAST_NAME", + "c.EMAIL", + "c.DELETE_FLAG", + "ca.CITY", + "ca.STATE", + "c.LOAD_TIMESTAMP" + ] + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_streaming_dwh_main.json b/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_streaming_dwh_main.json new file mode 100644 index 0000000..3342d0f --- /dev/null +++ 
b/samples/silver_sample/src/dataflows/stream_static_samples/dataflowspec/customer_stream_static_streaming_dwh_main.json @@ -0,0 +1,153 @@ +{ + "dataFlowId": "stream_static_streaming_dwh", + "dataFlowGroup": "stream_static_samples", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_ss_dwh", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": false, + "apply_as_deletes": "DELETE_FLAG = 1" + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "stream_static_streaming_dwh_1", + "stagingTables": { + "customer_ss_dwh_appnd_keys": { + "type": "ST" + }, + "customer_ss_dwh_merge_keys": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "CUSTOMER_ID" + ], + "scd_type": "2", + "sequence_by": "LOAD_TIMESTAMP", + "where": "", + "except_column_list": ["LOAD_TIMESTAMP"] + } + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ss_dwh_appnd_keys", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "LOAD_TIMESTAMP"] + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "customer_ss_dwh_appnd_keys", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": ["CUSTOMER_ID", "LOAD_TIMESTAMP"] + } + } + } + }, + "f_merge_keys": { + "flowType": "merge", + "flowDetails": { + 
"targetTable": "customer_ss_dwh_merge_keys", + "sourceView": "customer_ss_dwh_appnd_keys" + } + }, + "f_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "customer_ss_dwh", + "sourceView": "v_transform" + }, + "views": { + "v_transform": { + "mode": "stream", + "sourceType": "deltaJoin", + "sourceDetails": { + "sources": [ + { + "database": "live", + "table": "customer_ss_dwh_merge_keys", + "cdfEnabled": true, + "alias": "drv", + "joinMode": "stream" + }, + { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": false, + "alias": "c", + "joinMode": "static" + }, + { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": false, + "alias": "ca", + "joinMode": "static" + } + ], + "joins": [ + { + "joinType": "inner", + "condition": "drv.CUSTOMER_ID = c.CUSTOMER_ID" + }, + { + "joinType": "inner", + "condition": "drv.CUSTOMER_ID = ca.CUSTOMER_ID" + } + ], + "selectExp": [ + "c.CUSTOMER_ID", + "c.FIRST_NAME", + "c.LAST_NAME", + "c.EMAIL", + "ca.CITY", + "ca.STATE", + "c.DELETE_FLAG", + "drv.__START_AT" + ] + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/silver_sample/src/pipeline_configs/dev_substitutions.json b/samples/silver_sample/src/pipeline_configs/dev_substitutions.json new file mode 100644 index 0000000..d0a617e --- /dev/null +++ b/samples/silver_sample/src/pipeline_configs/dev_substitutions.json @@ -0,0 +1,7 @@ +{ + "tokens": { + "bronze_schema": "main.lakeflow_samples_bronze{logical_env}", + "silver_schema": "main.lakeflow_samples_silver{logical_env}", + "dpm_schema": "main.lakeflow_samples_dpm{logical_env}" + } +} \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/.gitignore b/samples/test_data_and_orchestrator/.gitignore new file mode 100644 index 0000000..0dab7f4 --- /dev/null +++ b/samples/test_data_and_orchestrator/.gitignore @@ -0,0 +1,8 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/** +!scratch/README.md 
diff --git a/samples/test_data_and_orchestrator/.vscode/__builtins__.pyi b/samples/test_data_and_orchestrator/.vscode/__builtins__.pyi new file mode 100644 index 0000000..0edd518 --- /dev/null +++ b/samples/test_data_and_orchestrator/.vscode/__builtins__.pyi @@ -0,0 +1,3 @@ +# Typings for Pylance in Visual Studio Code +# see https://github.com/microsoft/pyright/blob/main/docs/builtins.md +from databricks.sdk.runtime import * diff --git a/samples/test_data_and_orchestrator/.vscode/extensions.json b/samples/test_data_and_orchestrator/.vscode/extensions.json new file mode 100644 index 0000000..5d15eba --- /dev/null +++ b/samples/test_data_and_orchestrator/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "databricks.databricks", + "ms-python.vscode-pylance", + "redhat.vscode-yaml" + ] +} diff --git a/samples/test_data_and_orchestrator/.vscode/settings.json b/samples/test_data_and_orchestrator/.vscode/settings.json new file mode 100644 index 0000000..f19498d --- /dev/null +++ b/samples/test_data_and_orchestrator/.vscode/settings.json @@ -0,0 +1,17 @@ +{ + "python.analysis.stubPath": ".vscode", + "databricks.python.envFile": "${workspaceFolder}/.env", + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "python.analysis.extraPaths": ["src"], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + }, +} diff --git a/samples/test_data_and_orchestrator/databricks.yml b/samples/test_data_and_orchestrator/databricks.yml new file mode 100644 index 0000000..97318be --- /dev/null +++ b/samples/test_data_and_orchestrator/databricks.yml @@ -0,0 +1,96 @@ +# This is a Databricks asset bundle definition for test_data_and_orchestrator. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: test_data_and_orchestrator + +include: + - scratch/resources/*.yml + +variables: + # Core variables + catalog: + description: The target UC catalog + default: "main" + schema_namespace: + description: The prefix for all schemas which will end with one of (_bronze, _silver, _gold, or _dpm) + default: "lakeflow_samples" + logical_env: + description: The logical environment + default: "" + job_cluster_name: + description: The name of the job cluster + default: "lakehouse_framework_samples_job_cluster" + job_cluster_config: + description: Complete cluster configuration for compute clusters + default: + spark_version: "16.4.x-scala2.12" + node_type_id: "i3.xlarge" + num_workers: 1 + autoscale: + min_workers: 1 + max_workers: 5 + + # name prefix variable + name_prefix: + description: The name prefix + default: "[${bundle.target} ${workspace.current_user.short_name}] " + + # SDP Bronze pipeline ID lookup variables + lakeflow_samples_bronze_base_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Bronze - Base Pipeline (${var.logical_env})" + lakeflow_samples_feature_samples_pipeline_general_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Feature Samples - General Pipeline (${var.logical_env})" + lakeflow_samples_feature_samples_pipeline_data_quality_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Feature Samples - Data Quality Pipeline (${var.logical_env})" + lakeflow_samples_feature_samples_pipeline_snapshots_id: + lookup: + pipeline:
"${var.name_prefix}Lakeflow Framework - Feature Samples - Data Quality Pipeline (${var.logical_env})" + lakeflow_samples_feature_samples_pipeline_snapshots_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Feature Samples - Snapshots Pipeline (${var.logical_env})" + lakeflow_samples_feature_samples_pipeline_table_migration_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Feature Samples - Table Migration Pipeline (${var.logical_env})" + lakeflow_samples_bronze_kafka_samples_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Bronze - Kafka Samples Pipeline (${var.logical_env})" + lakeflow_samples_yaml_sample_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - YAML Sample Pipeline (${var.logical_env})" + lakeflow_samples_bronze_template_samples_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Bronze - Template Samples Pipeline (${var.logical_env})" + + # SDP Silver pipeline ID lookup variables + lakeflow_samples_silver_base_samples_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Base Samples Pipeline (${var.logical_env})" + lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Multi Source Streaming - Basic Pipeline (${var.logical_env})" + lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Final Pipeline (${var.logical_env})" + lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Multi Source Streaming - Decomposed Staging Pipeline (${var.logical_env})" + lakeflow_samples_silver_stream_static_basic_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Stream Static - Basic Pipeline 
(${var.logical_env})" + lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Stream Static - Streaming DWH Pipeline (${var.logical_env})" + lakeflow_samples_silver_customer_dpm_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Silver - Customer DPM Pipeline (${var.logical_env})" + + # Gold pipeline ID lookup variables + lakeflow_samples_gold_stream_static_pipeline_id: + lookup: + pipeline: "${var.name_prefix}Lakeflow Framework - Gold - Stream Static Pipeline (${var.logical_env})" + +targets: + # The 'dev' target, for development purposes. This target is the default. + dev: + mode: development + default: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/fixtures/.gitkeep b/samples/test_data_and_orchestrator/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/test_data_and_orchestrator/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. 
+ +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/test_data_and_orchestrator/resources/classic/kafka_samples_run.yml b/samples/test_data_and_orchestrator/resources/classic/kafka_samples_run.yml new file mode 100644 index 0000000..e235b85 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/classic/kafka_samples_run.yml @@ -0,0 +1,34 @@ +resources: + jobs: + kafka_samples_run: + name: Lakeflow Framework - Kafka Samples(${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + tasks: + - task_key: create_schemas_and_tables + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/create_schemas_and_tables + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + - task_key: staging_load + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/run_1_staging_load + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + depends_on: + - task_key: create_schemas_and_tables + + # Bronze pipelines + - task_key: lakeflow_samples_bronze_kafka_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_kafka_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load diff 
--git a/samples/test_data_and_orchestrator/resources/classic/run_1_load_and_schema_initialization_job.yml b/samples/test_data_and_orchestrator/resources/classic/run_1_load_and_schema_initialization_job.yml new file mode 100644 index 0000000..1c30333 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/classic/run_1_load_and_schema_initialization_job.yml @@ -0,0 +1,120 @@ +resources: + jobs: + lakeflow_samples_day_1_load_and_schema_initialization: + name: Lakeflow Framework - Run 1 - Load and Schema Initialization (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + tasks: + - task_key: create_schemas_and_tables + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/create_schemas_and_tables + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + - task_key: staging_load + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/run_1_staging_load + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + depends_on: + - task_key: create_schemas_and_tables + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: true + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_basic_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_customer_dpm_pipeline + pipeline_task: +
pipeline_id: ${var.lakeflow_samples_silver_customer_dpm_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: true + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true + \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/classic/run_2_load_job.yml b/samples/test_data_and_orchestrator/resources/classic/run_2_load_job.yml new file mode 100644 index 0000000..f7b68b9 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/classic/run_2_load_job.yml @@ -0,0 +1,102 @@ +resources: + jobs: + lakeflow_samples_day_2_load: + name: Lakeflow Framework - Run 2 - Load (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + tasks: + - task_key: staging_load + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/run_2_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: 
staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} +
full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/classic/run_3_load_job.yml b/samples/test_data_and_orchestrator/resources/classic/run_3_load_job.yml new file mode 100644 index 0000000..e7dab25 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/classic/run_3_load_job.yml @@ -0,0 +1,102 @@ +resources: + jobs: + lakeflow_samples_day_3_load: + name: Lakeflow Framework - Run 3 - Load (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + tasks: + - task_key: staging_load + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/run_3_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + 
pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/classic/run_4_load_job.yml b/samples/test_data_and_orchestrator/resources/classic/run_4_load_job.yml new
file mode 100644 index 0000000..e7d82a0 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/classic/run_4_load_job.yml @@ -0,0 +1,102 @@ +resources: + jobs: + lakeflow_samples_day_4_load: + name: Lakeflow Framework - Run 4 - Load (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + tasks: + - task_key: staging_load + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/run_4_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + 
depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/serverless/kafka_samples_run.yml b/samples/test_data_and_orchestrator/resources/serverless/kafka_samples_run.yml new file mode 100644 index 0000000..9934fa4 --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/serverless/kafka_samples_run.yml @@ -0,0 +1,29 @@ +resources: + jobs: + kafka_samples_run: + name: Lakeflow Framework - Kafka Samples(${var.logical_env}) + tasks: + - task_key: 
create_schemas_and_tables + notebook_task: + notebook_path: ${workspace.file_path}/src/create_schemas_and_tables + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + - task_key: staging_load + notebook_task: + notebook_path: ${workspace.file_path}/src/run_1_staging_load + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + depends_on: + - task_key: create_schemas_and_tables + + # Bronze pipelines + - task_key: lakeflow_samples_bronze_kafka_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_kafka_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load diff --git a/samples/test_data_and_orchestrator/resources/serverless/run_1_load_and_schema_initialization_job.yml b/samples/test_data_and_orchestrator/resources/serverless/run_1_load_and_schema_initialization_job.yml new file mode 100644 index 0000000..588c82d --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/serverless/run_1_load_and_schema_initialization_job.yml @@ -0,0 +1,122 @@ +resources: + jobs: + lakeflow_samples_day_1_load_and_schema_initialization: + name: Lakeflow Framework - Run 1 - Load and Schema Initialization (${var.logical_env}) + tasks: + - task_key: create_schemas_and_tables + notebook_task: + notebook_path: ${workspace.file_path}/src/create_schemas_and_tables + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + - task_key: staging_load + notebook_task: + notebook_path: ${workspace.file_path}/src/run_1_staging_load + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + depends_on: + - task_key: create_schemas_and_tables + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load + - task_key: bronze_template_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_template_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: staging_load + + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: true + depends_on: + - 
task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_customer_dpm_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_customer_dpm_pipeline_id} + full_refresh: true + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: true + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true + \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/serverless/run_2_load_job.yml b/samples/test_data_and_orchestrator/resources/serverless/run_2_load_job.yml new file mode 100644 index 0000000..6c0acda --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/serverless/run_2_load_job.yml @@ -0,0 +1,104 @@ +resources: + jobs: + lakeflow_samples_day_2_load: + name: Lakeflow Framework - Run 2 - Load (${var.logical_env}) + tasks: + - task_key: staging_load + notebook_task: + notebook_path: ${workspace.file_path}/src/run_2_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: bronze_template_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_template_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + 
depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/serverless/run_3_load_job.yml b/samples/test_data_and_orchestrator/resources/serverless/run_3_load_job.yml new file mode 100644 index 0000000..94c0bbf --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/serverless/run_3_load_job.yml @@ -0,0 +1,104 @@ +resources: + jobs: + lakeflow_samples_day_3_load: + name: Lakeflow Framework - Run 3 - Load (${var.logical_env}) + tasks: + - task_key: staging_load + notebook_task: + notebook_path: ${workspace.file_path}/src/run_3_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: bronze_template_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_template_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/resources/serverless/run_4_load_job.yml b/samples/test_data_and_orchestrator/resources/serverless/run_4_load_job.yml new file mode 100644 index 0000000..12ac7fc --- /dev/null +++ b/samples/test_data_and_orchestrator/resources/serverless/run_4_load_job.yml @@ -0,0 +1,104 @@ +resources: + jobs: + lakeflow_samples_day_4_load: + name: Lakeflow Framework - Run 4 - Load (${var.logical_env}) + tasks: + - task_key: staging_load + notebook_task: + notebook_path: ${workspace.file_path}/src/run_4_staging_load + base_parameters: + catalog: ${var.catalog} + logical_env: ${var.logical_env} + + # Bronze pipelines + - task_key: bronze_base_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_base_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_general + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_general_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_data_quality + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_feature_samples_pipeline_data_quality_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_snapshots + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_snapshots_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: feature_samples_pipeline_table_migration + pipeline_task: + pipeline_id: ${var.lakeflow_samples_feature_samples_pipeline_table_migration_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: yaml_sample_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_yaml_sample_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + - task_key: bronze_template_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_bronze_template_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: staging_load + + # Silver pipelines + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_basic_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_basic_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_multi_source_streaming_decomp_final_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_final_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + - task_key: silver_multi_source_streaming_decomp_staging_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_multi_source_streaming_decomp_staging_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_base_samples_pipeline + pipeline_task: + pipeline_id: 
${var.lakeflow_samples_silver_base_samples_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + - task_key: silver_stream_static_streaming_dwh_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_silver_stream_static_streaming_dwh_pipeline_id} + full_refresh: false + depends_on: + - task_key: bronze_base_pipeline + + # Gold pipelines + - task_key: gold_stream_static_pipeline + pipeline_task: + pipeline_id: ${var.lakeflow_samples_gold_stream_static_pipeline_id} + full_refresh: false + depends_on: + - task_key: silver_base_samples_pipeline + + queue: + enabled: true \ No newline at end of file diff --git a/samples/test_data_and_orchestrator/scratch/README.md b/samples/test_data_and_orchestrator/scratch/README.md new file mode 100644 index 0000000..e6cfb81 --- /dev/null +++ b/samples/test_data_and_orchestrator/scratch/README.md @@ -0,0 +1,4 @@ +# scratch + +This folder is reserved for personal, exploratory notebooks. +By default these are not committed to Git, as 'scratch' is listed in .gitignore. 
diff --git a/samples/test_data_and_orchestrator/src/create_schemas_and_tables.ipynb b/samples/test_data_and_orchestrator/src/create_schemas_and_tables.ipynb new file mode 100644 index 0000000..03cb7f7 --- /dev/null +++ b/samples/test_data_and_orchestrator/src/create_schemas_and_tables.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Create Schemas and Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {staging_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {bronze_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {bronze_schema}_yaml\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {silver_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {gold_schema}\")\n", + "spark.sql(f\"CREATE VOLUME IF NOT EXISTS {staging_schema}.{staging_volume}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {dpm_schema}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer (\n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string,\n", + " DELETE_FLAG boolean,\n", + " LOAD_TIMESTAMP timestamp)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer_address\")\n", + 
"spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer_address (\n", + " CUSTOMER_ID integer,\n", + " CITY string,\n", + " STATE string,\n", + " LOAD_TIMESTAMP timestamp)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer_snapshot_source\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer_snapshot_source (\n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string,\n", + " DELETE_FLAG boolean,\n", + " LOAD_TIMESTAMP timestamp)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer_historical_snapshot_source\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer_historical_snapshot_source (\n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string,\n", + " LOAD_TIMESTAMP timestamp)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer_snapshots\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer_snapshots (\n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string,\n", + " UPDATE_TIMESTAMP timestamp,\n", + " SNAPSHOT_TIMESTAMP timestamp,\n", + " SNAPSHOT_VERSION integer)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.customer_purchase\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.customer_purchase (\n", + " CUSTOMER_ID integer,\n", + " PRODUCT string,\n", + " QUANTITY integer,\n", + " PRICE decimal(10, 2),\n", + " PURCHASE_TIMESTAMP timestamp)\n", + 
"TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {bronze_schema}.table_to_migrate_scd0\")\n", + "spark.sql(f\"\"\"CREATE TABLE {bronze_schema}.table_to_migrate_scd0 ( \n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string\n", + ") TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")\n", + "\n", + "spark.sql(f\"DROP TABLE IF EXISTS {bronze_schema}.table_to_migrate_scd2\")\n", + "spark.sql(f\"\"\"CREATE TABLE {bronze_schema}.table_to_migrate_scd2 (\n", + " CUSTOMER_ID integer,\n", + " FIRST_NAME string,\n", + " LAST_NAME string,\n", + " EMAIL string,\n", + " EFFECTIVE_FROM timestamp,\n", + " EFFECTIVE_TO timestamp)\n", + "TBLPROPERTIES (delta.enableChangeDataFeed = true);\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Kafka Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"DROP TABLE IF EXISTS {staging_schema}.kafka_sink_sample_source\")\n", + "spark.sql(f\"\"\"CREATE TABLE {staging_schema}.kafka_sink_sample_source (\n", + " Message_Id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1),\n", + " Message_Ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n", + " Message_payload STRING\n", + ")\n", + "USING delta\n", + "TBLPROPERTIES (\n", + " 'delta.enableChangeDataFeed' = 'true',\n", + " 'delta.feature.allowColumnDefaults' = 'supported',\n", + " 'delta.feature.changeDataFeed' = 'supported',\n", + " 'delta.feature.columnMapping' = 'supported',\n", + " 'delta.feature.generatedColumns' = 'supported',\n", + " 'delta.feature.invariants' = 'supported',\n", + " 'delta.minReaderVersion' = '3',\n", + " 'delta.minWriterVersion' = '7'\n", + ")\"\"\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": 
"notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/initialize.ipynb b/samples/test_data_and_orchestrator/src/initialize.ipynb new file mode 100644 index 0000000..9d7b47f --- /dev/null +++ b/samples/test_data_and_orchestrator/src/initialize.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Initialize Variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"catalog\", \"main\")\n", + "dbutils.widgets.text(\"schema_namespace\", \"lakeflow_samples\")\n", + "dbutils.widgets.text(\"logical_env\", \"\")\n", + "\n", + "\n", + "catalog = dbutils.widgets.get(\"catalog\")\n", + "schema_namespace = dbutils.widgets.get(\"schema_namespace\")\n", + "logical_env = dbutils.widgets.get(\"logical_env\")\n", + "\n", + "staging_schema = f'{catalog}.{schema_namespace}_staging{logical_env}'\n", + "bronze_schema = f'{catalog}.{schema_namespace}_bronze{logical_env}'\n", + "silver_schema = f'{catalog}.{schema_namespace}_silver{logical_env}'\n", + "gold_schema = f'{catalog}.{schema_namespace}_gold{logical_env}'\n", + "dpm_schema = f'{catalog}.{schema_namespace}_dpm{logical_env}'\n", + "staging_volume = \"stg_volume\"\n", + "\n", + "volume_root_file_path = f\"/Volumes/{staging_schema}/{staging_volume}\".replace(\".\", \"/\")\n", + "customer_file_path = f\"{volume_root_file_path}/customer\"\n", + "customer_snapshot_file_path = f\"{volume_root_file_path}/snapshot_customer\"\n", + "customer_snapshot_partitioned_file_path = 
f\"{volume_root_file_path}/snapshot_customer_partitioned\"\n", + "customer_snapshot_partitioned_parquet_file_path = f\"{volume_root_file_path}/snapshot_customer_partitioned_parquet\"\n", + "template_samples_base_file_path = f\"{volume_root_file_path}/template_samples\"\n", + "template_samples_customer_file_path = f\"{template_samples_base_file_path}/snapshot_customer\"\n", + "template_samples_customer_address_file_path = f\"{template_samples_base_file_path}/snapshot_customer_address\"\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb new file mode 100644 index 0000000..954679d --- /dev/null +++ b/samples/test_data_and_orchestrator/src/run_1_staging_load.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Day 1 Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Base Tables and Sources" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + 
"showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Base Data Load\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com', NULL, '2023-01-01 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', NULL, '2023-01-01 10:00:00')\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', NULL, '2023-01-01 10:00:00')\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_address (\n", + " CUSTOMER_ID,\n", + " CITY,\n", + " STATE,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'Melbourne', 'VIC', '2023-01-01 10:00:00')\n", + " , (2, 'Melbourne', 'VIC', '2023-01-01 10:00:00')\n", + " , (NULL, 'Melbourne', 'VIC', '2023-01-01 10:00:00')\n", + " , (4, 'Hobart', 'TAS', '2023-01-01 10:00:00')\n", + " , (10, 'Sydney', 'NSW', '2023-01-01 10:00:00')\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_purchase (\n", + " CUSTOMER_ID,\n", + " PRODUCT,\n", + " QUANTITY,\n", + " PRICE,\n", + " PURCHASE_TIMESTAMP)\n", + "VALUES\n", + " (1, 'Apples', 1, 10.00, '2023-01-01 10:00:00')\n", + " , (1, 'Bananas', 2, 20.00, '2023-01-01 10:00:00')\n", + " , (2, 'Oranges', 3, 30.00, '2023-01-01 10:00:00')\n", + " , (2, 'Pears', 4, 40.00, '2023-01-01 10:00:00')\n", + " , (10, 'Apples', 5, 50.00, '2023-01-01 10:00:00')\n", + " , (10, 'Bananas', 6, 60.00, '2023-01-01 10:00:00')\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Base File Load\n", + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "1,John,Doe,john.doe@example.com,,2023-01-01 10:00:00\\n\n", + "2,Jane,Smith,jane.smith@example.com,,2023-01-01 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " 
f\"{customer_file_path}/customer_1.csv\",\n", + " file_content,\n", + " True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Tables and Sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Snapshot Sources\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_snapshot_source (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com', NULL, '2023-01-01 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', NULL, '2023-01-01 10:00:00')\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', NULL, '2023-01-01 10:00:00')\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_historical_snapshot_source (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com', '2024-01-01 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', '2024-01-01 10:00:00')\n", + " , (1, 'John', 'Doe', 'jdoe@example.com', '2024-01-04 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', '2024-01-04 10:00:00')\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', '2024-01-04 10:00:00')\n", + " , (4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', '2023-01-04 10:00:00')\n", + " , (1, 'John', 'Doe', 'jdoe@example.com', '2024-02-10 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', '2024-02-10 10:00:00')\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', '2024-02-10 10:00:00')\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Historical table snapshots\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_snapshots (\n", + " CUSTOMER_ID,\n", + " 
FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " UPDATE_TIMESTAMP,\n", + " SNAPSHOT_TIMESTAMP,\n", + " SNAPSHOT_VERSION)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com', '2023-01-01 00:00:00', '2023-01-01 00:00:00',0)\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', '2023-01-01 00:00:00', '2023-01-01 00:00:00',0)\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', '2023-01-01 00:00:00','2023-01-01 00:00:00', 0)\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_snapshots (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " UPDATE_TIMESTAMP,\n", + " SNAPSHOT_TIMESTAMP,\n", + " SNAPSHOT_VERSION)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'jdoe@example.com', '2023-01-02 00:00:00','2023-01-02 10:00:00', 1)\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', '2023-01-02 00:00:00','2023-01-02 10:00:00', 1)\n", + " , (4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', '2023-01-02 00:00:00','2023-01-02 10:00:00', 1)\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', '2023-01-01 00:00:00','2023-01-02 10:00:00', 1);\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete all files and directories in snapshot directories\n", + "dbutils.fs.rm(customer_snapshot_file_path, True)\n", + "dbutils.fs.rm(customer_snapshot_partitioned_file_path, True)\n", + "dbutils.fs.rm(customer_snapshot_partitioned_parquet_file_path, True)\n", + "dbutils.fs.rm(template_samples_customer_file_path, True)\n", + "dbutils.fs.rm(template_samples_customer_address_file_path, True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Historical Snapshot File Sources\n", + "\n", + "\n", + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "1,John,Doe,john.doe@example.com,,2024-01-01 10:00:00\\n\n", + 
"2,Jane,Smith,jane.smith@example.com,,2024-01-01 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_file_path}/customer_2024_01_01.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_partitioned_file_path}/YEAR=2024/MONTH=01/DAY=01/customer.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_file_path}/customer_2024_01_01.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "1,John,Doe,jdoe@example.com,,2024-01-04 10:00:00\\n\n", + "3,Alice,Green,alice.green@example.com,,2024-01-04 10:00:00\\n\n", + "4,Joe,Bloggs,joe.bloggs@example.com,,2024-01-04 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_file_path}/customer_2024_01_04.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_file_path}/sub_dir_test/customer_2024_01_04.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_partitioned_file_path}/YEAR=2024/MONTH=01/DAY=04/customer.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_file_path}/customer_2024_01_04.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "\n", + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "1,John,Doe,jdoe@example.com,,2024-02-10 10:00:00\\n\n", + "3,Alice,Green,alice.green@example.com,,2024-02-10 10:00:00\\n\n", + "4,Joe,Bloggs,joe.bloggs@example.com,,2024-02-10 10:00:00\\n\n", + "5,Sarah,Jones,sarah.jones@example.com,,2024-02-10 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_file_path}/customer_2024_02_10.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " 
f\"{customer_snapshot_partitioned_file_path}/YEAR=2024/MONTH=02/DAY=10/customer.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_file_path}/customer_2024_02_10.csv\",\n", + " file_content,\n", + " True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Historical Snapshot File Sources for customer_address\n", + "\n", + "customer_address_file_content = \"\"\"CUSTOMER_ID,CITY,STATE,LOAD_TIMESTAMP\n", + "1,Melbourne,VIC,2024-01-01 10:00:00\n", + "2,Melbourne,VIC,2024-01-01 10:00:00\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_address_file_path}/customer_address_2024_01_01.csv\",\n", + " customer_address_file_content,\n", + " True\n", + ")\n", + "\n", + "customer_address_file_content = \"\"\"CUSTOMER_ID,CITY,STATE,LOAD_TIMESTAMP\n", + "1,Melbourne,VIC,2024-01-04 10:00:00\n", + "4,Hobart,TAS,2024-01-04 10:00:00\n", + "10,Sydney,NSW,2024-01-04 10:00:00\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_address_file_path}/customer_address_2024_01_04.csv\",\n", + " customer_address_file_content,\n", + " True\n", + ")\n", + "\n", + "customer_address_file_content = \"\"\"CUSTOMER_ID,CITY,STATE,LOAD_TIMESTAMP\n", + "1,Sydney,NSW,2024-02-10 10:00:00\n", + "4,Hobart,TAS,2024-02-10 10:00:00\n", + "10,Brisbane,QLD,2024-02-10 10:00:00\n", + "12,Perth,WA,2024-02-10 10:00:00\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_address_file_path}/customer_address_2024_02_10.csv\",\n", + " customer_address_file_content,\n", + " True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType\n", + "\n", + "# Historical Snapshot Parquet Sources \n", + "schema = StructType([\n", + " StructField(\"CUSTOMER_ID\", StringType(), 
True),\n", + " StructField(\"FIRST_NAME\", StringType(), True),\n", + " StructField(\"LAST_NAME\", StringType(), True),\n", + " StructField(\"EMAIL\", StringType(), True),\n", + " StructField(\"DELETE_FLAG\", StringType(), True),\n", + " StructField(\"LOAD_TIMESTAMP\", StringType(), True)\n", + "])\n", + "\n", + "data = [\n", + " [\"1\", \"John\", \"Doe\", \"john.doe@example.com\", \"\", \"2024-01-01 10:00:00\"],\n", + " [\"2\", \"Jane\", \"Smith\", \"jane.smith@example.com\", \"\", \"2024-01-01 10:00:00\"]\n", + "]\n", + "\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "df.write.parquet(\n", + " f\"{customer_snapshot_partitioned_parquet_file_path}/YEAR=2024/MONTH=01/DAY=01/customer.parquet\",\n", + " mode=\"overwrite\"\n", + ")\n", + "\n", + "\n", + "data = [\n", + " [\"1\", \"John\", \"Doe\", \"jdoe@example.com\", \"\", \"2024-01-04 10:00:00\"],\n", + " [\"3\", \"Alice\", \"Green\", \"alice.green@example.com\", \"\", \"2024-01-04 10:00:00\"], \n", + " [\"4\", \"Joe\", \"Bloggs\", \"joe.bloggs@example.com\", \"\", \"2024-01-04 10:00:00\"]\n", + "]\n", + "\n", + "df = spark.createDataFrame(data, schema=schema) \n", + "\n", + "df.write.parquet(\n", + " f\"{customer_snapshot_partitioned_parquet_file_path}/YEAR=2024/MONTH=01/DAY=04/customer.parquet\",\n", + " mode=\"overwrite\"\n", + ")\n", + "\n", + "\n", + "data = [\n", + " [\"1\", \"John\", \"Doe\", \"jdoe@example.com\", \"\", \"2024-02-10 10:00:00\"],\n", + " [\"3\", \"Alice\", \"Green\", \"alice.green@example.com\", \"\", \"2024-02-10 10:00:00\"], \n", + " [\"4\", \"Joe\", \"Bloggs\", \"joe.bloggs@example.com\", \"\", \"2024-02-10 10:00:00\"],\n", + " [\"5\", \"Sarah\", \"Jones\", \"sarah.jones@example.com\", \"\", \"2024-02-10 10:00:00\"]\n", + "]\n", + "\n", + "\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "df.write.parquet(\n", + " f\"{customer_snapshot_partitioned_parquet_file_path}/YEAR=2024/MONTH=02/DAY=10/customer.parquet\",\n", + " mode=\"overwrite\"\n", + 
")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Kafka Sink Sample Source Data Load\n", + "message_payload = '{\"Mortgage_id\": \"123\", \"Mortgage_fac\": \"M1-str\", \"Mortgage_score\": 30}'\n", + "spark.sql(f\"INSERT INTO {staging_schema}.kafka_sink_sample_source (Message_payload) VALUES ('{message_payload}')\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Table Migration Source\n", + "# Simulate a table ready for migration into a pipleline.\n", + "# This simulates a table being populated outside of SDP by classic pyspark jobs, from the same sources.\n", + "# Read the documentation on Table Migration for more details.\n", + "\n", + "# Simulates Day 1 data load.\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {bronze_schema}.table_to_migrate_scd2 (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " EFFECTIVE_FROM,\n", + " EFFECTIVE_TO)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com', '2023-01-01 10:00:00', NULL),\n", + " (2, 'Jane', 'Smith', 'jane.smith@example.com', '2023-01-01 10:00:00', NULL),\n", + " (10, 'Richard', 'Johnson', 'richard.johnson@example.com', '2023-01-01 10:00:00', NULL),\n", + " (30, 'Closed Same Day', 'Customer', 'closed_same_day.customer@example.com', '2023-01-01 10:00:00', '2023-01-01 10:00:00'),\n", + " (40, 'Closed Normal', 'Customer', 'cnormal.customer@example.com', '2023-01-02 10:00:00', '2023-05-01 10:00:00'),\n", + " (40, 'Closed Normal', 'Customer', 'closed_normal.customer@example.com', '2023-01-01 10:00:00', '2023-01-02 10:00:00')\"\"\")\n", + "\n", + "# Simulates Day 1 data load.\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {bronze_schema}.table_to_migrate_scd0 (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@example.com'),\n", + " (2, 'Jane', 'Smith', 
'jane.smith@example.com'),\n", + " (10, 'Richard', 'Johnson', 'richard.johnson@example.com')\"\"\")\n", + "\n", + "# Simulates Day 2 data load.\n", + "# spark.sql(f\"\"\"\n", + "# MERGE INTO {staging_schema}.table_to_migrate AS t\n", + "# USING (\n", + "# SELECT\n", + "# merge_key,\n", + "# id,\n", + "# first_name,\n", + "# last_name,\n", + "# email,\n", + "# delete_flag,\n", + "# load_timestamp\n", + "# FROM VALUES\n", + "# (NULL, 1, 'John', 'Doe', 'jdoe@example.com', NULL, TIMESTAMP('2023-01-02 10:00:00')),\n", + "# (1, 1, 'John', 'Doe', 'jdoe@example.com', NULL, TIMESTAMP('2023-01-02 10:00:00')),\n", + "# (3, 3, 'Alice', 'Green', 'alice.green@example.com', NULL, TIMESTAMP('2023-01-02 10:00:00')),\n", + "# (4, 4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', NULL, TIMESTAMP('2023-01-02 10:00:00')),\n", + "# (10, 10,'Richard', 'Johnson','richard.johnson@example.com', 1, TIMESTAMP('2023-01-02 10:00:00'))\n", + "# AS t(merge_key, id, first_name, last_name, email, delete_flag, load_timestamp)\n", + "# ) AS src\n", + "# ON t.customer_id = src.merge_key and t.effective_to IS NULL\n", + "# WHEN MATCHED AND (\n", + "# t.first_name <> src.first_name\n", + "# OR t.last_name <> src.last_name\n", + "# OR t.email <> src.email\n", + "# OR src.delete_flag = 1\n", + "# ) THEN\n", + "# UPDATE SET t.effective_to = src.load_timestamp\n", + "\n", + "# WHEN NOT MATCHED THEN\n", + "# INSERT (customer_id, first_name, last_name, email, effective_from, effective_to)\n", + "# VALUES (src.id, src.first_name, src.last_name, src.email, src.load_timestamp, NULL);\n", + "# \"\"\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + 
"nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/run_2_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_2_staging_load.ipynb new file mode 100644 index 0000000..d4071b6 --- /dev/null +++ b/samples/test_data_and_orchestrator/src/run_2_staging_load.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Run 2 Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'jdoe@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', True, '2023-01-02 10:00:00');\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT OVERWRITE TABLE {staging_schema}.customer_snapshot_source (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'jdoe@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (2, 'Jane', 'Smith', 'jane.smith@example.com', NULL, '2023-01-01 
10:00:00')\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', NULL, '2023-01-01 10:00:00')\"\"\")\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_address (\n", + " CUSTOMER_ID,\n", + " CITY,\n", + " STATE,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (2, 'Perth', 'WA', '2023-01-02 10:00:00')\n", + " , (3, 'Sydney', 'NSW', '2023-01-02 10:00:00')\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_historical_snapshot_source (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (6, 'Someone', 'else', 'someone.else@example.com', '2024-04-01 10:00:00')\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "6,Someone,else,someone.else@example.com,,2023-03-01 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_file_path}/customer_2024_03_01.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_snapshot_partitioned_file_path}/YEAR=2024/MONTH=03/DAY=01/customer.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_file_path}/customer_2024_03_01.csv\",\n", + " file_content,\n", + " True)\n", + "\n", + "# Template samples customer_address data for run 2\n", + "customer_address_file_content = \"\"\"CUSTOMER_ID,CITY,STATE,LOAD_TIMESTAMP\n", + "2,Perth,WA,2024-03-01 10:00:00\n", + "3,Sydney,NSW,2024-03-01 10:00:00\n", + "6,Adelaide,SA,2024-03-01 10:00:00\n", + "\"\"\"\n", 
+ "\n", + "dbutils.fs.put(\n", + " f\"{template_samples_customer_address_file_path}/customer_address_2024_03_01.csv\",\n", + " customer_address_file_content,\n", + " True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.types import StructType, StructField, StringType\n", + "\n", + "schema = StructType([\n", + " StructField(\"CUSTOMER_ID\", StringType(), True),\n", + " StructField(\"FIRST_NAME\", StringType(), True),\n", + " StructField(\"LAST_NAME\", StringType(), True),\n", + " StructField(\"EMAIL\", StringType(), True),\n", + " StructField(\"DELETE_FLAG\", StringType(), True),\n", + " StructField(\"LOAD_TIMESTAMP\", StringType(), True)\n", + "])\n", + "\n", + "data = [\n", + " [\"6\", \"Someone\", \"else\", \"someone.else@example.com\", \"\", \"2023-03-01 10:00:00\"]\n", + "]\n", + "\n", + "df = spark.createDataFrame(data, schema=schema)\n", + "\n", + "df.write.parquet(\n", + " f\"{customer_snapshot_partitioned_parquet_file_path}/YEAR=2024/MONTH=03/DAY=01/customer.parquet\",\n", + " mode=\"overwrite\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_content = \"\"\"CUSTOMER_ID,FIRST_NAME,LAST_NAME,EMAIL,DELETE_FLAG,LOAD_TIMESTAMP\\n\n", + "1,John,Doe,jdoe@example.com,,2023-01-02 10:00:00\\n\n", + "3,Alice,Green,alice.green@example.com,,2023-01-02 10:00:00\\n\n", + "4,Joe,Bloggs,joe.bloggs@example.com,,2023-01-02 10:00:00\\n\n", + "\"\"\"\n", + "\n", + "dbutils.fs.put(\n", + " f\"{customer_file_path}/customer_2.csv\",\n", + " file_content,\n", + " True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_snapshots (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " UPDATE_TIMESTAMP,\n", + " SNAPSHOT_TIMESTAMP,\n", + " 
SNAPSHOT_VERSION)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'jdoe@example.com', '2023-01-02 00:00:00','2023-01-03T00:00:00', 2)\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', '2023-01-02 00:00:00', '2023-01-03T00:00:00', 2)\n", + " , (4, 'Joe', 'Bloggs', 'j.bloggs@example.com', '2023-01-03 00:00:00', '2023-01-03T00:00:00', 2) \n", + " \"\"\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/run_3_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_3_staging_load.ipynb new file mode 100644 index 0000000..2878ed2 --- /dev/null +++ b/samples/test_data_and_orchestrator/src/run_3_staging_load.ipynb @@ -0,0 +1,90 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Run 3 Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.sql(f\"\"\"INSERT OVERWRITE TABLE {staging_schema}.customer_snapshot_source (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " 
DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (2, 'Jane', 'Smith', 'jane.smith@example.com', NULL, '2023-01-01 10:00:00')\n", + " , (3, 'Alice', 'Green', 'alice.green@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (4, 'Joe', 'Bloggs', 'joe.bloggs@example.com', NULL, '2023-01-02 10:00:00')\n", + " , (10, 'Richard', 'Johnson', 'richard.johnson@example.com', NULL, '2023-01-01 10:00:00')\"\"\")\n", + "\n", + "\n", + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer_address (\n", + " CUSTOMER_ID,\n", + " CITY,\n", + " STATE,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'Brisbane', 'QLD', '2023-01-03 10:00:00')\"\"\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/run_4_staging_load.ipynb b/samples/test_data_and_orchestrator/src/run_4_staging_load.ipynb new file mode 100644 index 0000000..06fca47 --- /dev/null +++ b/samples/test_data_and_orchestrator/src/run_4_staging_load.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Run 4 Load" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.sql(f\"\"\"INSERT INTO TABLE {staging_schema}.customer (\n", + " CUSTOMER_ID,\n", + " FIRST_NAME,\n", + " LAST_NAME,\n", + " EMAIL,\n", + " DELETE_FLAG,\n", + " LOAD_TIMESTAMP)\n", + "VALUES\n", + " (1, 'John', 'Doe', 'john.doe@another.example.com', False, '2023-01-04 10:00:00')\"\"\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/test_data_and_orchestrator/src/test_kakfa_sink.ipynb b/samples/test_data_and_orchestrator/src/test_kakfa_sink.ipynb new file mode 100644 index 0000000..b5039c2 --- /dev/null +++ b/samples/test_data_and_orchestrator/src/test_kakfa_sink.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Read from Kafka" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%run \"./initialize\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dbutils.widgets.text(\"kafka_bootstrap_servers_tls\", \"b-1.oneenvkafka.fso631.c14.kafka.us-west-2.amazonaws.com:9094,b-2.oneenvkafka.fso631.c14.kafka.us-west-2.amazonaws.com:9094,b-3.oneenvkafka.fso631.c14.kafka.us-west-2.amazonaws.com:9094\")\n", + "dbutils.widgets.text(\"topic\", f\"contract{logical_env}\")\n", + "\n", + 
"kafka_bootstrap_servers_tls = dbutils.widgets.get(\"kafka_bootstrap_servers_tls\")\n", + "topic = dbutils.widgets.get(\"topic\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import col, from_json\n", + "\n", + "startingOffsets = \"earliest\"\n", + "kafka = (spark.readStream\n", + " .format(\"kafka\")\n", + " .option(\"kafka.bootstrap.servers\", kafka_bootstrap_servers_tls) \n", + " .option(\"kafka.security.protocol\", \"SSL\") \n", + " .option(\"subscribe\", topic)\n", + " .option(\"startingOffsets\", startingOffsets)\n", + " .load())\n", + "\n", + "display(kafka.select(col(\"key\").cast(\"string\"), col(\"value\").cast(\"string\")))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/tpch_sample/.gitignore b/samples/tpch_sample/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/samples/tpch_sample/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/samples/tpch_sample/.vscode/settings.json b/samples/tpch_sample/.vscode/settings.json new file mode 100644 index 0000000..1a79a81 --- /dev/null +++ b/samples/tpch_sample/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook 
source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" +} \ No newline at end of file diff --git a/samples/tpch_sample/databricks.yml b/samples/tpch_sample/databricks.yml new file mode 100644 index 0000000..4421a9a --- /dev/null +++ b/samples/tpch_sample/databricks.yml @@ -0,0 +1,48 @@ +# This is a Databricks asset bundle definition for tpch_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: tpch_sample + +include: + - scratch/resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + schema_namespace: + description: The prefix for all schemas which will end with one of (_bronze, _silver, _gold, or _dpm) + default: "lakeflow_samples" + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: bronze + logical_env: + description: The logical environment + default: "" + pipeline_cluster_config: + description: Basic cluster config, add node types as necessary + default: + label: default + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + job_cluster_config: + description: Complete cluster configuration for compute clusters + default: + spark_version: "16.4.x-scala2.12" + node_type_id: "i3.xlarge" + num_workers: 1 + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + +targets: + # The 'dev' target, for development purposes. This target is the default. 
+ dev: + mode: development + default: true diff --git a/samples/tpch_sample/fixtures/.gitkeep b/samples/tpch_sample/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/tpch_sample/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. + +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/tpch_sample/pytest.ini b/samples/tpch_sample/pytest.ini new file mode 100644 index 0000000..80432c2 --- /dev/null +++ b/samples/tpch_sample/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/samples/tpch_sample/resources/classic/bronze_pipeline.yml b/samples/tpch_sample/resources/classic/bronze_pipeline.yml new file mode 100644 index 0000000..5e1f822 --- /dev/null +++ b/samples/tpch_sample/resources/classic/bronze_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_tpch_bronze_pipeline: + name: Lakeflow Framework - TPCH - Bronze Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + target: ${var.bronze_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + 
pipeline.layer: bronze + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_bronze \ No newline at end of file diff --git a/samples/tpch_sample/resources/classic/gold_pipeline.yml b/samples/tpch_sample/resources/classic/gold_pipeline.yml new file mode 100644 index 0000000..fc7ac73 --- /dev/null +++ b/samples/tpch_sample/resources/classic/gold_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_tpch_gold_pipeline: + name: Lakeflow Framework - TPCH - Gold Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + target: ${var.gold_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: gold + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_gold \ No newline at end of file diff --git a/samples/tpch_sample/resources/classic/orchestrator_run_1.yml b/samples/tpch_sample/resources/classic/orchestrator_run_1.yml new file mode 100644 index 0000000..7f973e8 --- /dev/null +++ b/samples/tpch_sample/resources/classic/orchestrator_run_1.yml @@ -0,0 +1,31 @@ +resources: + jobs: + lakeflow_samples_tpch_sample_run_1_full_refresh_job: + name: Lakeflow Framework - TPCH Sample - Run 1 - Full Refresh (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + + tasks: + + # Bronze pipeline + - task_key: lakeflow_samples_tpch_bronze_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_bronze_pipeline.id} + full_refresh: true + + # Silver pipeline + - task_key: lakeflow_samples_tpch_silver_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_silver_pipeline.id} + full_refresh: true + 
depends_on: + - task_key: lakeflow_samples_tpch_bronze_pipeline + + # Gold pipeline + - task_key: lakeflow_samples_tpch_gold_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_gold_pipeline.id} + full_refresh: true + depends_on: + - task_key: lakeflow_samples_tpch_silver_pipeline \ No newline at end of file diff --git a/samples/tpch_sample/resources/classic/orchestrator_setup_staging_data.yml b/samples/tpch_sample/resources/classic/orchestrator_setup_staging_data.yml new file mode 100644 index 0000000..1298665 --- /dev/null +++ b/samples/tpch_sample/resources/classic/orchestrator_setup_staging_data.yml @@ -0,0 +1,18 @@ +resources: + jobs: + lakeflow_samples_tpch_sample_setup_data_job: + name: Lakeflow Framework - TPCH Sample - Setup Data (${var.logical_env}) + job_clusters: + - job_cluster_key: ${var.job_cluster_name} + new_cluster: ${var.job_cluster_config} + + tasks: + - task_key: setup_data + job_cluster_key: ${var.job_cluster_name} + notebook_task: + notebook_path: ${workspace.file_path}/src/test_data/setup_data + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} + new_cluster: ${var.job_cluster_config} diff --git a/samples/tpch_sample/resources/classic/silver_pipeline.yml b/samples/tpch_sample/resources/classic/silver_pipeline.yml new file mode 100644 index 0000000..7fcf450 --- /dev/null +++ b/samples/tpch_sample/resources/classic/silver_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_tpch_silver_pipeline: + name: Lakeflow Framework - TPCH - Silver Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + target: ${var.silver_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: 
${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: silver + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_silver \ No newline at end of file diff --git a/samples/tpch_sample/resources/serverless/bronze_pipeline.yml b/samples/tpch_sample/resources/serverless/bronze_pipeline.yml new file mode 100644 index 0000000..384f345 --- /dev/null +++ b/samples/tpch_sample/resources/serverless/bronze_pipeline.yml @@ -0,0 +1,21 @@ +resources: + pipelines: + lakeflow_samples_tpch_bronze_pipeline: + name: Lakeflow Framework - TPCH - Bronze Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + target: ${var.bronze_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: bronze + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_bronze + diff --git a/samples/tpch_sample/resources/serverless/gold_pipeline.yml b/samples/tpch_sample/resources/serverless/gold_pipeline.yml new file mode 100644 index 0000000..eb39a45 --- /dev/null +++ b/samples/tpch_sample/resources/serverless/gold_pipeline.yml @@ -0,0 +1,20 @@ +resources: + pipelines: + lakeflow_samples_tpch_gold_pipeline: + name: Lakeflow Framework - TPCH - Gold Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + target: ${var.gold_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: gold + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_gold \ No 
newline at end of file diff --git a/samples/tpch_sample/resources/serverless/orchestrator_run_1.yml b/samples/tpch_sample/resources/serverless/orchestrator_run_1.yml new file mode 100644 index 0000000..4d51dd2 --- /dev/null +++ b/samples/tpch_sample/resources/serverless/orchestrator_run_1.yml @@ -0,0 +1,27 @@ +resources: + jobs: + lakeflow_samples_tpch_sample_run_1_full_refresh_job: + name: Lakeflow Framework - TPCH Sample - Run 1 - Full Refresh (${var.logical_env}) + tasks: + + # Bronze pipeline + - task_key: lakeflow_samples_tpch_bronze_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_bronze_pipeline.id} + full_refresh: true + + # Silver pipeline + - task_key: lakeflow_samples_tpch_silver_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_silver_pipeline.id} + full_refresh: true + depends_on: + - task_key: lakeflow_samples_tpch_bronze_pipeline + + # Gold pipeline + - task_key: lakeflow_samples_tpch_gold_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.lakeflow_samples_tpch_gold_pipeline.id} + full_refresh: true + depends_on: + - task_key: lakeflow_samples_tpch_silver_pipeline \ No newline at end of file diff --git a/samples/tpch_sample/resources/serverless/orchestrator_setup_staging_data.yml b/samples/tpch_sample/resources/serverless/orchestrator_setup_staging_data.yml new file mode 100644 index 0000000..d7b8ddf --- /dev/null +++ b/samples/tpch_sample/resources/serverless/orchestrator_setup_staging_data.yml @@ -0,0 +1,12 @@ +resources: + jobs: + lakeflow_samples_tpch_sample_setup_data_job: + name: Lakeflow Framework - TPCH Sample - Setup Data (${var.logical_env}) + tasks: + - task_key: setup_data + notebook_task: + notebook_path: ${workspace.file_path}/src/test_data/setup_data + base_parameters: + catalog: ${var.catalog} + schema_namespace: ${var.schema_namespace} + logical_env: ${var.logical_env} diff --git a/samples/tpch_sample/resources/serverless/silver_pipeline.yml 
b/samples/tpch_sample/resources/serverless/silver_pipeline.yml new file mode 100644 index 0000000..016591e --- /dev/null +++ b/samples/tpch_sample/resources/serverless/silver_pipeline.yml @@ -0,0 +1,20 @@ +resources: + pipelines: + lakeflow_samples_tpch_silver_pipeline: + name: Lakeflow Framework - TPCH - Silver Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + target: ${var.silver_schema}${var.logical_env} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: silver + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: tpch_silver \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_address_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_address_main.json new file mode 100644 index 0000000..5dfc6a8 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_address_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "customer_address_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_customer_address", + "sourceDetails": { + "path": "{staging_source_root}/customer_address/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/customer_address_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_address_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_main.json 
b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_main.json new file mode 100644 index 0000000..bcf6f8e --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "customer_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_customer", + "sourceDetails": { + "path": "{staging_source_root}/customer/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/customer_schema.json", + "selectExp": [ + "customer_id", + "name", + "mktseg", + "load_timestamp" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_phone_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_phone_main.json new file mode 100644 index 0000000..900fdd1 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/customer_phone_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "customer_phone_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_customer_phone", + "sourceDetails": { + "path": "{staging_source_root}/customer_phone/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/customer_phone_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_phone", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/customer_phone_schema.json" + } +} \ No newline at end of file diff --git 
a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/lineitem_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/lineitem_main.json new file mode 100644 index 0000000..d1641a2 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/lineitem_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "lineitem_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_lineitem", + "sourceDetails": { + "path": "{staging_source_root}/lineitem/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/lineitem_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "lineitem", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/lineitem_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/nation_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/nation_main.json new file mode 100644 index 0000000..0743eb1 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/nation_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "nation_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_nation", + "sourceDetails": { + "path": "{staging_source_root}/nation/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/nation_schema.json", + "selectExp": [ + "n_nationkey", + "n_name", + "n_regionkey", + "load_timestamp" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "nation", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/nation_schema.json" + } +} \ No newline at end of file diff --git 
a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/orders_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/orders_main.json new file mode 100644 index 0000000..a34be59 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/orders_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "orders_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_orders", + "sourceDetails": { + "path": "{staging_source_root}/orders/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/orders_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "orders", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/orders_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/part_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/part_main.json new file mode 100644 index 0000000..0f2df56 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/part_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "part_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_part", + "sourceDetails": { + "path": "{staging_source_root}/part/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/part_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "part", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/part_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/region_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/region_main.json new file mode 100644 
index 0000000..f9563d5 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/region_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "region_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_region", + "sourceDetails": { + "path": "{staging_source_root}/region/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/region_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "region", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/region_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_address_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_address_main.json new file mode 100644 index 0000000..7b49e1a --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_address_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "supplier_address_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_supplier_address", + "sourceDetails": { + "path": "{staging_source_root}/supplier_address/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/supplier_address_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/supplier_address_schema.json" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_main.json new file mode 100644 index 0000000..9c67bb1 --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_main.json @@ -0,0 +1,37 @@ +{ + "dataFlowId": "supplier_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_supplier", + "sourceDetails": { + "path": "{staging_source_root}/supplier/*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/supplier_schema.json", + "selectExp": [ + "supplier_id", + "name", + "load_timestamp" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/supplier_schema.json" + }, + "dataQualityExpectationsEnabled": true, + "dataQualityExpectationsPath": "supplier_dqe.json", + "quarantineMode": "table", + "quarantineTargetDetails": { + "targetFormat": "delta", + "table": "supplier_quarantine" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_phone_main.json b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_phone_main.json new file mode 100644 index 0000000..dfe572b --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/dataflowspec/supplier_phone_main.json @@ -0,0 +1,25 @@ +{ + "dataFlowId": "supplier_phone_bronze", + "dataFlowGroup": "tpch_bronze", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "cloudFiles", + "sourceViewName": "v_supplier_phone", + "sourceDetails": { + "path": "{staging_source_root}/supplier_phone*.csv", + "readerOptions": { + "cloudFiles.format": "csv", + "header": "true" + }, + "schemaPath": "source/supplier_phone_schema.json" + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier_phone", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "target/supplier_phone_schema.json" + } +} \ No newline at end of 
file diff --git a/samples/tpch_sample/src/dataflows/bronze/expectations/customer_address_dqe.json b/samples/tpch_sample/src/dataflows/bronze/expectations/customer_address_dqe.json new file mode 100644 index 0000000..7405996 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/expectations/customer_address_dqe.json @@ -0,0 +1,15 @@ +{ + "expect_or_drop": [ + { + "name": "PK not null", + "constraint": "CUSTOMER_ID IS NOT NULL", + "tag": "Validity" + }, + { + "name": "enabledTest", + "constraint": "CUSTOMER_ID = 1", + "tag": "Validity", + "enabled": false + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/expectations/supplier_dqe.json b/samples/tpch_sample/src/dataflows/bronze/expectations/supplier_dqe.json new file mode 100644 index 0000000..f67ab19 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/expectations/supplier_dqe.json @@ -0,0 +1,14 @@ +{ + "expect_or_drop": [ + { + "name": "PK not null", + "constraint": "supplier_id IS NOT NULL", + "tag": "Validity" + }, + { + "name": "load_timestamp not null", + "constraint": "load_timestamp IS NOT NULL", + "tag": "Validity" + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_address_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_address_schema.json new file mode 100644 index 0000000..eef89d4 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nat_id", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git 
a/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_phone_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_phone_schema.json new file mode 100644 index 0000000..67c7cae --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_phone_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "ptype", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_schema.json new file mode 100644 index 0000000..fea28a3 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/customer_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "acctbal", + "type": "decimal(18, 2)", + "nullable": false, + "metadata": {} + }, + { + "name": "mktseg", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/lineitem_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/lineitem_schema.json new file mode 100644 index 0000000..3e05ee7 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/lineitem_schema.json @@ -0,0 +1,107 @@ +{ + "type": "struct", + "fields": [ + { + "name": 
"l_orderkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_partkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_suppkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_linenumber", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "l_quantity", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_extendedprice", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_discount", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_tax", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_returnflag", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_linestatus", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_commitdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_receiptdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipinstruct", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipmode", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_comment", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/nation_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/nation_schema.json new file mode 100644 index 0000000..c2ec88d --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/nation_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "n_nationkey", + "type": "long", + 
"nullable": false, + "metadata": {} + }, + { + "name": "n_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "n_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "n_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/orders_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/orders_schema.json new file mode 100644 index 0000000..37ff52f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/orders_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "o_orderkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_orderstatus", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_totalprice", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "o_orderdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "o_orderpriority", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_clerk", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_shippriority", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "o_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/part_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/part_schema.json new file mode 100644 index 0000000..6b76df4 --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/bronze/schemas/source/part_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "p_partkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "p_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_mfgr", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_brand", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_type", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_size", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "p_container", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_retailprice", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "p_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/region_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/region_schema.json new file mode 100644 index 0000000..ee6a8bd --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/region_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "r_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "r_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "r_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_address_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_address_schema.json new file mode 
100644 index 0000000..f6d93bd --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nat_id", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_phone_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_phone_schema.json new file mode 100644 index 0000000..ddc4888 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_phone_schema.json @@ -0,0 +1,23 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_schema.json new file mode 100644 index 0000000..c3d91ec --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/source/supplier_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "acctbal", + "type": "decimal(18, 2)", + "nullable": false, + "metadata": {} + }, + { + "name": "comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { 
+ "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_address_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_address_schema.json new file mode 100644 index 0000000..eef89d4 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nat_id", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_phone_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_phone_schema.json new file mode 100644 index 0000000..67c7cae --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_phone_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "ptype", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema copy 2.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema copy 2.json new file mode 100644 index 0000000..b4212ee --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema copy 2.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "mktseg", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema.json new file mode 100644 index 0000000..b4212ee --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/customer_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "mktseg", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/lineitem_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/lineitem_schema.json new file mode 100644 index 0000000..3e05ee7 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/lineitem_schema.json @@ -0,0 +1,107 @@ +{ + "type": "struct", + "fields": [ + { + "name": "l_orderkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_partkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_suppkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "l_linenumber", + "type": "integer", + "nullable": true, + 
"metadata": {} + }, + { + "name": "l_quantity", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_extendedprice", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_discount", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_tax", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "l_returnflag", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_linestatus", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_commitdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_receiptdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipinstruct", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_shipmode", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "l_comment", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/nation_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/nation_schema.json new file mode 100644 index 0000000..74dc100 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/nation_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "n_nationkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "n_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "n_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No 
newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/orders_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/orders_schema.json new file mode 100644 index 0000000..37ff52f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/orders_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "o_orderkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_orderstatus", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_totalprice", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "o_orderdate", + "type": "date", + "nullable": true, + "metadata": {} + }, + { + "name": "o_orderpriority", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_clerk", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_shippriority", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "o_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/part_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/part_schema.json new file mode 100644 index 0000000..6b76df4 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/part_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "p_partkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "p_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_mfgr", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_brand", + "type": "string", + "nullable": false, + 
"metadata": {} + }, + { + "name": "p_type", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_size", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "p_container", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_retailprice", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "p_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/region_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/region_schema.json new file mode 100644 index 0000000..ee6a8bd --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/region_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "r_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "r_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "r_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_address_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_address_schema.json new file mode 100644 index 0000000..f6d93bd --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nat_id", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + 
"name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_phone_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_phone_schema.json new file mode 100644 index 0000000..ddc4888 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_phone_schema.json @@ -0,0 +1,23 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_schema.json b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_schema.json new file mode 100644 index 0000000..4113b14 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/bronze/schemas/target/supplier_schema.json @@ -0,0 +1,23 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_id", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_customer_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_customer_main.json new file mode 100644 index 0000000..a2059f9 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_customer_main.json @@ -0,0 +1,138 @@ +{ + "dataFlowId": "dim_customer_gold", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_customer", + "schemaPath": 
"./dim_customer_schema.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "customer_key" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": true + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dim_customer", + "stagingTables": { + "stg_customer_appnd_keys": { + "type": "ST" + }, + "stg_customer_dedupe_keys": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "c_custkey" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"] + } + } + }, + "flows": { + "f_customer": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_appnd_keys", + "sourceView": "v_customer" + }, + "views": { + "v_customer": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": ["c_custkey", "__START_AT"] + } + } + } + }, + "f_customer_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_appnd_keys", + "sourceView": "v_customer_address" + }, + "views": { + "v_customer_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": ["c_custkey", "__START_AT"] + } + } + } + }, + "f_customer_phone": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_customer_appnd_keys", + "sourceView": "v_customer_phone" + }, + "views": { + "v_customer_phone": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "customer_phone", + "cdfEnabled": true, + "selectExp": ["c_custkey", "__START_AT"] + } + } + } + }, + "f_customer_merge_keys": { + "flowType": "merge", + "flowDetails": { + 
"targetTable": "stg_customer_dedupe_keys", + "sourceView": "stg_customer_appnd_keys" + } + }, + "f_customer_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "dim_customer", + "sourceView": "v_customer_transform" + }, + "views": { + "v_stg_customer_dedupe_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "stg_customer_dedupe_keys", + "cdfEnabled": true + } + }, + "v_customer_transform": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "dim_customer.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_location_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_location_main.json new file mode 100644 index 0000000..e9af88b --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_location_main.json @@ -0,0 +1,54 @@ +{ + "dataFlowId": "dim_location_gold", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_location", + "schemaPath": "./dim_location_schema.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "nation_key" + ], + "scd_type": "1", + "sequence_by": "meta_load_details.record_update_timestamp", + "ignore_null_updates": false + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dim_location", + "flows": { + "f_location": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "dim_location", + "sourceView": "v_location_tfm" + }, + "views": { + "v_nation_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "nation", + "cdfEnabled": true + } + }, + "v_location_tfm": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "dim_location.sql" + } + } + } + } + } + } + ] +} \ No 
newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_orders_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_orders_main.json new file mode 100644 index 0000000..977dbcf --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_orders_main.json @@ -0,0 +1,40 @@ +{ + "dataFlowId": "orders_gold", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_orders", + "sourceDetails": { + "database": "{silver_schema}", + "table": "orders", + "cdfEnabled": true, + "selectExp": [ + "o_orderkey as order_key", + "o_orderstatus as order_status", + "o_orderdate as order_date", + "o_orderpriority as order_priority", + "o_shippriority as ship_priority", + "o_comment as comment", + "__START_AT" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_orders", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "dim_orders_schema.json" + }, + "cdcSettings": { + "keys": [ + "order_key" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_supplier_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_supplier_main.json new file mode 100644 index 0000000..acd4b2d --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/dim_supplier_main.json @@ -0,0 +1,138 @@ +{ + "dataFlowId": "dim_supplier_gold", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "dim_supplier", + "schemaPath": "./dim_supplier_schema.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "supplier_key" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + 
"except_column_list": ["__START_AT"], + "ignore_null_updates": true + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "dim_supplier", + "stagingTables": { + "stg_supplier_appnd_keys": { + "type": "ST" + }, + "stg_supplier_dedupe_keys": { + "type": "ST", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "cdcSettings": { + "keys": [ + "s_suppkey" + ], + "scd_type": "2", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"] + } + } + }, + "flows": { + "f_supplier": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_supplier_appnd_keys", + "sourceView": "v_supplier" + }, + "views": { + "v_supplier": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "supplier", + "cdfEnabled": true, + "selectExp": ["s_suppkey", "__START_AT"] + } + } + } + }, + "f_supplier_address": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_supplier_appnd_keys", + "sourceView": "v_supplier_address" + }, + "views": { + "v_supplier_address": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "supplier_address", + "cdfEnabled": true, + "selectExp": ["s_suppkey", "__START_AT"] + } + } + } + }, + "f_supplier_phone": { + "flowType": "append_view", + "flowDetails": { + "targetTable": "stg_supplier_appnd_keys", + "sourceView": "v_supplier_phone" + }, + "views": { + "v_supplier_phone": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "supplier_phone", + "cdfEnabled": true, + "selectExp": ["s_suppkey", "__START_AT"] + } + } + } + }, + "f_supplier_merge_keys": { + "flowType": "merge", + "flowDetails": { + "targetTable": "stg_supplier_dedupe_keys", + "sourceView": "stg_supplier_appnd_keys" + } + }, + "f_supplier_target": { + "flowType": "merge", + "flowDetails": { + "targetTable": "dim_supplier", + "sourceView": 
"v_supplier_transform" + }, + "views": { + "v_stg_supplier_dedupe_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "live", + "table": "stg_supplier_dedupe_keys", + "cdfEnabled": true + } + }, + "v_supplier_transform": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "dim_supplier.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/fct_orders_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/fct_orders_main.json new file mode 100644 index 0000000..c5d0aac --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/fct_orders_main.json @@ -0,0 +1,56 @@ +{ + "dataFlowId": "fct_orders_gold", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "flow", + "targetFormat": "delta", + "targetDetails": { + "table": "fct_orders", + "schemaPath": "./fct_orders_schema.json", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + } + }, + "cdcSettings": { + "keys": [ + "order_key" + , "line_number" + ], + "scd_type": "1", + "sequence_by": "__START_AT", + "except_column_list": ["__START_AT"], + "ignore_null_updates": false + }, + "dataQualityExpectationsEnabled": false, + "flowGroups": [ + { + "flowGroupId": "fct_order_1", + "flows": { + "f_orders": { + "flowType": "merge", + "flowDetails": { + "targetTable": "fct_orders", + "sourceView": "v_orders_tfm" + }, + "views": { + "v_lineitem_cdf_feed": { + "mode": "stream", + "sourceType": "delta", + "sourceDetails": { + "database": "{silver_schema}", + "table": "lineitem", + "cdfEnabled": true + } + }, + "v_orders_tfm": { + "mode": "stream", + "sourceType": "sql", + "sourceDetails": { + "sqlPath": "fct_orders.sql" + } + } + } + } + } + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dataflowspec/snap_shot_facts_main.json b/samples/tpch_sample/src/dataflows/gold/dataflowspec/snap_shot_facts_main.json new file mode 
100644 index 0000000..2347303 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dataflowspec/snap_shot_facts_main.json @@ -0,0 +1,13 @@ +{ + "dataFlowId": "snap_shot_facts", + "dataFlowGroup": "tpch_gold", + "dataFlowType": "materialized_view", + "materializedViews": { + "fct_orders_monthly_snapshot": { + "sqlPath": "./fct_orders_monthly_snapshot.sql", + "tableDetails": { + "schemaPath": "./fct_orders_monthly_snapshot_schema.json" + } + } + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dml/dim_customer.sql b/samples/tpch_sample/src/dataflows/gold/dml/dim_customer.sql new file mode 100644 index 0000000..77458b4 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dml/dim_customer.sql @@ -0,0 +1,13 @@ +SELECT + drv.c_custkey as customer_key, + c.c_name as customer_name, + ca.c_address as customer_address, + cp.c_phone as customer_phone, + ca.c_nationkey as nation_key, + c.c_mktsegment as market_segment, + drv.__START_AT as __START_AT +FROM + STREAM(live.v_stg_customer_dedupe_cdf_feed) AS drv + JOIN {silver_schema}.customer AS c ON drv.c_custkey = c.c_custkey + JOIN {silver_schema}.customer_address AS ca ON drv.c_custkey = ca.c_custkey + JOIN {silver_schema}.customer_phone AS cp ON drv.c_custkey = cp.c_custkey \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dml/dim_location.sql b/samples/tpch_sample/src/dataflows/gold/dml/dim_location.sql new file mode 100644 index 0000000..ffefe6f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dml/dim_location.sql @@ -0,0 +1,7 @@ +SELECT + n.n_nationkey as nation_key, + n.n_name as nation_name, + r.r_name as region_name +FROM + STREAM(live.v_nation_cdf_feed) AS n + JOIN {silver_schema}.region AS r ON n.n_regionkey = r.r_regionkey \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dml/dim_supplier.sql b/samples/tpch_sample/src/dataflows/gold/dml/dim_supplier.sql new file mode 100644 index 0000000..f06e4b0 --- 
/dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dml/dim_supplier.sql @@ -0,0 +1,13 @@ +SELECT + drv.s_suppkey as supplier_key, + s.s_name as supplier_name, + sa.s_address as supplier_address, + sp.s_phone as supplier_phone, + n.n_nationkey as nation_key, + drv.__START_AT +FROM + STREAM(live.v_stg_supplier_dedupe_cdf_feed) AS drv + JOIN {silver_schema}.supplier AS s ON drv.s_suppkey = s.s_suppkey + JOIN {silver_schema}.supplier_address AS sa ON drv.s_suppkey = sa.s_suppkey + JOIN {silver_schema}.supplier_phone AS sp ON drv.s_suppkey = sp.s_suppkey + JOIN {silver_schema}.nation AS n ON sa.s_nationkey = n.n_nationkey \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/dml/fct_orders.sql b/samples/tpch_sample/src/dataflows/gold/dml/fct_orders.sql new file mode 100644 index 0000000..3b64441 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/dml/fct_orders.sql @@ -0,0 +1,25 @@ +SELECT + o.o_orderkey as order_key, + l.l_linenumber as line_number, + c.c_custkey as customer_key, + l.l_partkey as part_key, + l.l_suppkey as supplier_key, + o.o_orderdate as order_date, + o.o_orderstatus as order_status, + l.l_quantity as quantity, + l.l_extendedprice as extended_price, + l.l_discount as discount, + l.l_tax as tax, + l.l_returnflag as return_flag, + l.l_linestatus as line_status, + l.l_shipdate as ship_date, + l.l_commitdate as commit_date, + l.l_receiptdate as receipt_date, + l.l_shipinstruct as ship_instruct, + l.l_shipmode as ship_mode, + l.l_comment as comment, + l.__START_AT +FROM + STREAM(live.v_lineitem_cdf_feed) AS l + JOIN {silver_schema}.orders AS o ON l.l_orderkey = o.o_orderkey + JOIN {silver_schema}.customer AS c ON o.o_custkey = c.c_custkey diff --git a/samples/tpch_sample/src/dataflows/gold/dml/fct_orders_monthly_snapshot.sql b/samples/tpch_sample/src/dataflows/gold/dml/fct_orders_monthly_snapshot.sql new file mode 100644 index 0000000..3a16670 --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/gold/dml/fct_orders_monthly_snapshot.sql @@ -0,0 +1,15 @@ +SELECT + CAST(date_trunc('month', o_orderdate) AS DATE) AS snapshot_month, + o_custkey AS custkey, + COUNT(DISTINCT o_orderkey) AS total_orders, + CAST(SUM(l_quantity) AS DECIMAL(18,2)) AS total_quantity, + CAST(SUM(l_extendedprice * (1 - l_discount)) AS DECIMAL(18,2)) AS total_sales, + CAST(AVG(l_discount) AS DECIMAL(5,4)) AS avg_discount, + CAST(MAX(l_extendedprice * (1 - l_discount)) AS DECIMAL(18,2)) AS max_order_total, + CAST(MIN(l_extendedprice * (1 - l_discount)) AS DECIMAL(18,2)) AS min_order_total +FROM + {silver_schema}.orders AS o + JOIN {silver_schema}.lineitem AS l ON o.o_orderkey = l.l_orderkey +GROUP BY + date_trunc('month', o_orderdate), + o_custkey \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/dim_customer_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/dim_customer_schema.json new file mode 100644 index 0000000..4490873 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/schemas/dim_customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "customer_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "customer_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "customer_address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "customer_phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nation_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "market_segment", + "type": "string", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/dim_location_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/dim_location_schema.json new file mode 100644 index 0000000..aad4c2c --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/gold/schemas/dim_location_schema.json @@ -0,0 +1,23 @@ +{ + "type": "struct", + "fields": [ + { + "name": "nation_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "nation_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "region_name", + "type": "string", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/dim_orders_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/dim_orders_schema.json new file mode 100644 index 0000000..b8674ba --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/schemas/dim_orders_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "order_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "order_status", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "order_date", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "order_priority", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "ship_priority", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "comment", + "type": "string", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/dim_supplier_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/dim_supplier_schema.json new file mode 100644 index 0000000..11f5498 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/schemas/dim_supplier_schema.json @@ -0,0 +1,35 @@ +{ + "type": "struct", + "fields": [ + { + "name": "supplier_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "supplier_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "supplier_address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + 
"name": "supplier_phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "nation_key", + "type": "long", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_monthly_snapshot_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_monthly_snapshot_schema.json new file mode 100644 index 0000000..7ea047a --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_monthly_snapshot_schema.json @@ -0,0 +1,53 @@ +{ + "type": "struct", + "fields": [ + { + "name": "snapshot_month", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "total_orders", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "total_quantity", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "total_sales", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "avg_discount", + "type": "decimal(5,4)", + "nullable": true, + "metadata": {} + }, + { + "name": "max_order_total", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "min_order_total", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_schema.json b/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_schema.json new file mode 100644 index 0000000..51e0881 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/gold/schemas/fct_orders_schema.json @@ -0,0 +1,119 @@ +{ + "type": "struct", + "fields": [ + { + "name": "order_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "line_number", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "customer_key", + "type": "long", + "nullable": false, + "metadata": {} + 
}, + { + "name": "part_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "supplier_key", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "order_date", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "order_status", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "quantity", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "extended_price", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "discount", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "tax", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "return_flag", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "line_status", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "ship_date", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "commit_date", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "receipt_date", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "ship_instruct", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "ship_mode", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "comment", + "type": "string", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_address_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_address_main.json new file mode 100644 index 0000000..edd6fee --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_address_main.json @@ -0,0 +1,32 @@ +{ + "dataFlowId": "customer_address_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": 
"delta", + "sourceViewName": "v_customer_address", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_address", + "cdfEnabled": true, + "selectExp": ["customer_id as c_custkey", "address as c_address", "nat_id as c_nationkey", "load_timestamp"] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_address_schema.json" + }, + "cdcSettings": { + "keys": [ + "c_custkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_main.json new file mode 100644 index 0000000..eae678f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_main.json @@ -0,0 +1,38 @@ +{ + "dataFlowId": "customer_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_customer", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer", + "cdfEnabled": true, + "selectExp": [ + "customer_id as c_custkey", + "name as c_name", + "mktseg as c_mktsegment", + "load_timestamp" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_schema.json" + }, + "cdcSettings": { + "keys": [ + "c_custkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false, + "track_history_column_list": ["c_name", "c_mktsegment"] + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_phone_main.json 
b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_phone_main.json new file mode 100644 index 0000000..256385c --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/customer_phone_main.json @@ -0,0 +1,38 @@ +{ + "dataFlowId": "customer_phone_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_customer_phone", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "customer_phone", + "cdfEnabled": true, + "selectExp": [ + "customer_id as c_custkey", + "ptype as c_phonetype", + "phone as c_phone", + "load_timestamp" + ] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "customer_phone", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "customer_phone_schema.json" + }, + "cdcSettings": { + "keys": [ + "c_custkey", + "c_phonetype" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/lineitem_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/lineitem_main.json new file mode 100644 index 0000000..4370a2a --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/lineitem_main.json @@ -0,0 +1,32 @@ +{ + "dataFlowId": "lineitem_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_lineitem", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "lineitem", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "lineitem", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "lineitem_schema.json" + }, + "cdcSettings": { + "keys": [ + "l_orderkey" + ,"l_linenumber" + ], + 
"scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/nation_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/nation_main.json new file mode 100644 index 0000000..fc9d2ef --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/nation_main.json @@ -0,0 +1,30 @@ +{ + "dataFlowId": "nation_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_nation", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "nation", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "nation", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "nation_schema.json" + }, + "cdcSettings": { + "keys": [ + "n_nationkey" + ], + "scd_type": "1", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"] + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/orders_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/orders_main.json new file mode 100644 index 0000000..af34223 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/orders_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "orders_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_orders", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "orders", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "orders", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "orders_schema.json" + }, + "cdcSettings": { + "keys": [ + "o_orderkey" + ], + "scd_type": "2", + 
"sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/part_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/part_main.json new file mode 100644 index 0000000..efe0c9a --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/part_main.json @@ -0,0 +1,31 @@ +{ + "dataFlowId": "part_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_part", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "part", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "part", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "part_schema.json" + }, + "cdcSettings": { + "keys": [ + "p_partkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/region_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/region_main.json new file mode 100644 index 0000000..2fe0775 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/region_main.json @@ -0,0 +1,30 @@ +{ + "dataFlowId": "region_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_region", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "region", + "cdfEnabled": true + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "region", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "region_schema.json" + }, + "cdcSettings": { + "keys": [ + "r_regionkey" + ], + "scd_type": "1", + 
"sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"] + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_address_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_address_main.json new file mode 100644 index 0000000..61fb4ba --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_address_main.json @@ -0,0 +1,32 @@ +{ + "dataFlowId": "supplier_address_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_supplier_address", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "supplier_address", + "cdfEnabled": true, + "selectExp": ["supplier_id as s_suppkey", "address as s_address", "nat_id as s_nationkey", "load_timestamp"] + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier_address", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "supplier_address_schema.json" + }, + "cdcSettings": { + "keys": [ + "s_suppkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_main.json new file mode 100644 index 0000000..4e6aa61 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_main.json @@ -0,0 +1,37 @@ +{ + "dataFlowId": "supplier_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_supplier", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "supplier", + "cdfEnabled": true, + "selectExp": [ + "supplier_id as s_suppkey", + "name as s_name", + "load_timestamp" + ] 
+ }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "supplier_schema.json" + }, + "cdcSettings": { + "keys": [ + "s_suppkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false, + "track_history_column_list": ["s_name"] + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_phone_main.json b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_phone_main.json new file mode 100644 index 0000000..f0c4af0 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/dataflowspec/supplier_phone_main.json @@ -0,0 +1,33 @@ +{ + "dataFlowId": "supplier_phone_silver", + "dataFlowGroup": "tpch_silver", + "dataFlowType": "standard", + "sourceSystem": "tpch", + "sourceType": "delta", + "sourceViewName": "v_supplier_phone", + "sourceDetails": { + "database": "{bronze_schema}", + "table": "supplier_phone", + "cdfEnabled": true, + "selectExp": ["supplier_id as s_suppkey", "phone as s_phone", "load_timestamp"] + + }, + "mode": "stream", + "targetFormat": "delta", + "targetDetails": { + "table": "supplier_phone", + "tableProperties": { + "delta.enableChangeDataFeed": "true" + }, + "schemaPath": "supplier_phone_schema.json" + }, + "cdcSettings": { + "keys": [ + "s_suppkey" + ], + "scd_type": "2", + "sequence_by": "load_timestamp", + "except_column_list": ["load_timestamp"], + "ignore_null_updates": false + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/expectations/customer_address_dqe.json b/samples/tpch_sample/src/dataflows/silver/expectations/customer_address_dqe.json new file mode 100644 index 0000000..7405996 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/expectations/customer_address_dqe.json @@ -0,0 +1,15 @@ +{ + "expect_or_drop": [ + { + "name": "PK 
not null", + "constraint": "CUSTOMER_ID IS NOT NULL", + "tag": "Validity" + }, + { + "name": "enabledTest", + "constraint": "CUSTOMER_ID = 1", + "tag": "Validity", + "enabled": false + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/customer_address_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/customer_address_schema.json new file mode 100644 index 0000000..43c9560 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "c_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "c_address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "c_nationkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/customer_phone_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/customer_phone_schema.json new file mode 100644 index 0000000..8a3e7ec --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/customer_phone_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "c_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "c_phonetype", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "c_phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/customer_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/customer_schema.json new file mode 100644 index 0000000..56f18cb --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/silver/schemas/customer_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "c_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "c_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "c_mktsegment", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/lineitem_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/lineitem_schema.json new file mode 100644 index 0000000..c243e72 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/lineitem_schema.json @@ -0,0 +1,107 @@ +{ + "type": "struct", + "fields": [ + { + "name": "l_orderkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "l_partkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "l_suppkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "l_linenumber", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "l_quantity", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "l_extendedprice", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "l_discount", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "l_tax", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "l_returnflag", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "l_linestatus", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "l_shipdate", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "l_commitdate", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": 
"l_receiptdate", + "type": "date", + "nullable": false, + "metadata": {} + }, + { + "name": "l_shipinstruct", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "l_shipmode", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "l_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/nation_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/nation_schema.json new file mode 100644 index 0000000..74dc100 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/nation_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "n_nationkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "n_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "n_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/orders_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/orders_schema.json new file mode 100644 index 0000000..37ff52f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/orders_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "o_orderkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_custkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "o_orderstatus", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_totalprice", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "o_orderdate", + "type": "date", + "nullable": true, + 
"metadata": {} + }, + { + "name": "o_orderpriority", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_clerk", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "o_shippriority", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "o_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/part_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/part_schema.json new file mode 100644 index 0000000..6b76df4 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/part_schema.json @@ -0,0 +1,65 @@ +{ + "type": "struct", + "fields": [ + { + "name": "p_partkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "p_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_mfgr", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_brand", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_type", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_size", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "p_container", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "p_retailprice", + "type": "decimal(18,2)", + "nullable": false, + "metadata": {} + }, + { + "name": "p_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/region_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/region_schema.json new file mode 100644 index 0000000..ee6a8bd --- /dev/null +++ 
b/samples/tpch_sample/src/dataflows/silver/schemas/region_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "r_regionkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "r_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "r_comment", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/supplier_address_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_address_schema.json new file mode 100644 index 0000000..c828cc3 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "s_suppkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "s_address", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "s_nationkey", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/dataflows/silver/schemas/supplier_phone_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_phone_schema.json new file mode 100644 index 0000000..da5bd38 --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_phone_schema.json @@ -0,0 +1,23 @@ +{ + "type": "struct", + "fields": [ + { + "name": "s_suppkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "s_phone", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git 
a/samples/tpch_sample/src/dataflows/silver/schemas/supplier_schema.json b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_schema.json new file mode 100644 index 0000000..d8cc24f --- /dev/null +++ b/samples/tpch_sample/src/dataflows/silver/schemas/supplier_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "s_suppkey", + "type": "long", + "nullable": false, + "metadata": {} + }, + { + "name": "s_name", + "type": "string", + "nullable": false, + "metadata": {} + }, + { + "name": "s_acctbal", + "type": "decimal(18,2)", + "nullable": true, + "metadata": {} + }, + { + "name": "load_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/tpch_sample/src/pipeline_configs/dev_substitutions.json b/samples/tpch_sample/src/pipeline_configs/dev_substitutions.json new file mode 100644 index 0000000..f0483a0 --- /dev/null +++ b/samples/tpch_sample/src/pipeline_configs/dev_substitutions.json @@ -0,0 +1,9 @@ +{ + "tokens": { + "staging_schema": "main.lakeflow_samples_tpch_staging{logical_env}", + "bronze_schema": "main.lakeflow_samples_tpch_bronze{logical_env}", + "silver_schema": "main.lakeflow_samples_tpch_silver{logical_env}", + "gold_schema": "main.lakeflow_samples_tpch_gold{logical_env}", + "staging_source_root": "/Volumes/main/lakeflow_samples_tpch_staging{logical_env}/stg_volume/tpc_h" + } +} \ No newline at end of file diff --git a/samples/tpch_sample/src/test_data/initialize.ipynb b/samples/tpch_sample/src/test_data/initialize.ipynb new file mode 100644 index 0000000..581f32e --- /dev/null +++ b/samples/tpch_sample/src/test_data/initialize.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Initialize Variables" + ] + }, + { + 
"cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "dbutils.widgets.text(\"catalog\", \"main\")\n", + "dbutils.widgets.text(\"schema_namespace\", \"lakeflow_samples\")\n", + "dbutils.widgets.text(\"logical_env\", \"\")\n", + "\n", + "catalog = dbutils.widgets.get(\"catalog\")\n", + "schema_namespace = dbutils.widgets.get(\"schema_namespace\")\n", + "logical_env = dbutils.widgets.get(\"logical_env\")\n", + "\n", + "staging_schema = f'{catalog}.{schema_namespace}_staging{logical_env}'\n", + "bronze_schema = f'{catalog}.{schema_namespace}_bronze{logical_env}'\n", + "silver_schema = f'{catalog}.{schema_namespace}_silver{logical_env}'\n", + "gold_schema = f'{catalog}.{schema_namespace}_gold{logical_env}'\n", + "staging_volume = \"stg_volume\"\n", + "\n", + "sample_source_schema = 'samples.tpch'\n", + "\n", + "volume_root_file_path = f\"/Volumes/{staging_schema}/{staging_volume}/tpc_h\".replace(\".\", \"/\")\n", + "\n", + "current_time_str = datetime.now().strftime(\"%Y_%m_%d_%H_%M_%S\")\n", + "\n", + "writer_options = {\n", + " \"header\": \"true\",\n", + " \"compression\": \"gzip\"\n", + "}" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/tpch_sample/src/test_data/setup_data.ipynb b/samples/tpch_sample/src/test_data/setup_data.ipynb new file mode 100644 index 0000000..b2a20b8 --- /dev/null +++ b/samples/tpch_sample/src/test_data/setup_data.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"%run \"./initialize\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Schemas and Tables" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {staging_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {bronze_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {silver_schema}\")\n", + "spark.sql(f\"CREATE SCHEMA IF NOT EXISTS {gold_schema}\")\n", + "spark.sql(f\"CREATE VOLUME IF NOT EXISTS {staging_schema}.{staging_volume}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pyspark.sql.functions as F" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DELETE DATA FROM VOLUME IF EXISTS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "def wait_for_deletion(path, max_wait=60, sleep_interval=2):\n", + " \"\"\"\n", + " Wait until the specified path is deleted or timeout occurs.\n", + "\n", + " :param path: Path to check for existence\n", + " :param max_wait: Maximum wait time in seconds\n", + " :param sleep_interval: Time to wait between checks\n", + " \"\"\"\n", + " start_time = time.time()\n", + " \n", + " while time.time() - start_time < max_wait:\n", + " try:\n", + " dbutils.fs.ls(path)\n", + " time.sleep(sleep_interval)\n", + " except Exception as e:\n", + " if \"java.io.FileNotFoundException\" in str(e):\n", + " print(f\"Deletion confirmed: {path} does not exist.\")\n", + " return True\n", + " else:\n", + " raise e\n", + " \n", + " print(f\"Warning: Timeout reached while waiting for {path} to be deleted.\")\n", + " return False\n", + "\n", + "# Trigger deletion\n", + "dbutils.fs.rm(f\"{volume_root_file_path}\", True)\n", + "\n", + "# Wait until fully deleted\n", + "wait_for_deletion(volume_root_file_path)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# SUPPORTING TABLES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region_df = (\n", + " spark.sql(f\"SELECT * FROM {sample_source_schema}.region\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/region/region_{current_time_str}.csv\")\n", + ")\n", + "\n", + "nation_items_df = (\n", + " spark.sql(f\"SELECT * FROM {sample_source_schema}.nation\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/nation1/nation_{current_time_str}.csv\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CUSTOMER TABLES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_df = (\n", + " spark.sql(f\"SELECT c_custkey as customer_id, c_name as name, c_acctbal as acctbal, c_mktsegment as mktseg FROM {sample_source_schema}.customer\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/customer/customer_{current_time_str}.csv\")\n", + ")\n", + "\n", + "customer_address_df = (\n", + " spark.sql(f\"SELECT c_custkey as customer_id, c_address as address, c_nationkey as nat_id FROM {sample_source_schema}.customer\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/customer_address/customer_address_{current_time_str}.csv\")\n", + ")\n", + "\n", + "customer_phone_df = (\n", + " spark.sql(f\"SELECT c_custkey as customer_id, 'M' as type, c_phone as 
phone FROM {sample_source_schema}.customer\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/customer_phone/customer_phone{current_time_str}.csv\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SUPPLIER TABLES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "supplier_df = (\n", + " spark.sql(f\"SELECT s_suppkey as supplier_id, s_name as name, s_acctbal as acctbal, s_comment as comment FROM {sample_source_schema}.supplier\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/supplier/supplier_{current_time_str}.csv\")\n", + ")\n", + "\n", + "supplier_address_df = (\n", + " spark.sql(f\"SELECT s_suppkey as supplier_id, s_address as address, s_nationkey as nat_id FROM {sample_source_schema}.supplier\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/supplier_address/supplier_address_{current_time_str}.csv\")\n", + ")\n", + "\n", + "supplier_phone_df = (\n", + " spark.sql(f\"SELECT s_suppkey as supplier_id, s_phone as phone FROM {sample_source_schema}.supplier\").dropDuplicates().withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/supplier_phone/supplier_phone_{current_time_str}.csv\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ORDERS TABLES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "orders_df = (\n", + " 
spark.sql(f\"SELECT * FROM {sample_source_schema}.orders\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/orders/orders_{current_time_str}.csv\")\n", + ")\n", + "\n", + "line_items_df = (\n", + " spark.sql(f\"SELECT * FROM {sample_source_schema}.lineitem\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/lineitem/lineitem_{current_time_str}.csv\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PART TABLES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "part_df = (\n", + " spark.sql(f\"SELECT * FROM {sample_source_schema}.part\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/part/part_{current_time_str}.csv\")\n", + ")\n", + "\n", + "partsupp_df = (\n", + " spark.sql(f\"SELECT * FROM {sample_source_schema}.partsupp\").withColumn(\"load_timestamp\", F.current_timestamp())\n", + " .write.format(\"csv\").mode(\"overwrite\").options(**writer_options)\n", + " .save(f\"{volume_root_file_path}/partssupp/partssupp_{current_time_str}.csv\")\n", + ")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/samples/tpch_sample/tests/main_test.py b/samples/tpch_sample/tests/main_test.py new file mode 100644 index 
0000000..333ffa3 --- /dev/null +++ b/samples/tpch_sample/tests/main_test.py @@ -0,0 +1,6 @@ +from bronze_sample.main import get_taxis, get_spark + + +def test_main(): + taxis = get_taxis(get_spark()) + assert taxis.count() > 5 diff --git a/samples/yaml_sample/.gitignore b/samples/yaml_sample/.gitignore new file mode 100644 index 0000000..bc4bd13 --- /dev/null +++ b/samples/yaml_sample/.gitignore @@ -0,0 +1,7 @@ +.databricks/ +build/ +dist/ +__pycache__/ +*.egg-info +.venv/ +scratch/ diff --git a/samples/yaml_sample/.vscode/settings.json b/samples/yaml_sample/.vscode/settings.json new file mode 100644 index 0000000..1a79a81 --- /dev/null +++ b/samples/yaml_sample/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------" +} \ No newline at end of file diff --git a/samples/yaml_sample/databricks.yml b/samples/yaml_sample/databricks.yml new file mode 100644 index 0000000..870637f --- /dev/null +++ b/samples/yaml_sample/databricks.yml @@ -0,0 +1,37 @@ +# This is a Databricks asset bundle definition for yaml_sample. +# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. +bundle: + name: yaml_sample + +include: + - scratch/resources/*.yml + +variables: + catalog: + description: The target UC catalog + framework_source_path: + description: The full workspace path to the framework src folder + schema: + description: The target UC schema + workspace_host: + description: workspace url used for API calls from Framework (usually same as deployment URL) e.g. 
https://e2-demo-field-eng.cloud.databricks.com/ + layer: + description: The target layer + default: bronze + logical_env: + description: The logical environment + default: "" + pipeline_cluster_config: + description: Basic cluster config, add node types as necessary + default: + label: default + autoscale: + min_workers: 1 + max_workers: 5 + mode: ENHANCED + +targets: + # The 'dev' target, for development purposes. This target is the default. + dev: + mode: development + default: true diff --git a/samples/yaml_sample/fixtures/.gitkeep b/samples/yaml_sample/fixtures/.gitkeep new file mode 100644 index 0000000..fa25d27 --- /dev/null +++ b/samples/yaml_sample/fixtures/.gitkeep @@ -0,0 +1,22 @@ +# Fixtures + +This folder is reserved for fixtures, such as CSV files. + +Below is an example of how to load fixtures as a data frame: + +``` +import pandas as pd +import os + +def get_absolute_path(*relative_parts): + if 'dbutils' in globals(): + base_dir = os.path.dirname(dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get()) # type: ignore + path = os.path.normpath(os.path.join(base_dir, *relative_parts)) + return path if path.startswith("/Workspace") else "/Workspace" + path + else: + return os.path.join(*relative_parts) + +csv_file = get_absolute_path("..", "fixtures", "mycsv.csv") +df = pd.read_csv(csv_file) +display(df) +``` diff --git a/samples/yaml_sample/pytest.ini b/samples/yaml_sample/pytest.ini new file mode 100644 index 0000000..80432c2 --- /dev/null +++ b/samples/yaml_sample/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = tests +pythonpath = src diff --git a/samples/yaml_sample/resources/classic/yaml_sample_pipeline.yml b/samples/yaml_sample/resources/classic/yaml_sample_pipeline.yml new file mode 100644 index 0000000..c86edd7 --- /dev/null +++ b/samples/yaml_sample/resources/classic/yaml_sample_pipeline.yml @@ -0,0 +1,22 @@ +resources: + pipelines: + lakeflow_samples_yaml_sample_pipeline: + name: Lakeflow Framework - YAML 
Sample Pipeline (${var.logical_env}) + channel: CURRENT + clusters: + - ${var.pipeline_cluster_config} + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: yaml_samples + root_path: ${workspace.file_path}/src/dataflows/yaml_samples \ No newline at end of file diff --git a/samples/yaml_sample/resources/serverless/yaml_sample_pipeline.yml b/samples/yaml_sample/resources/serverless/yaml_sample_pipeline.yml new file mode 100644 index 0000000..19c4c7b --- /dev/null +++ b/samples/yaml_sample/resources/serverless/yaml_sample_pipeline.yml @@ -0,0 +1,23 @@ +resources: + pipelines: + lakeflow_samples_yaml_sample_pipeline: + name: Lakeflow Framework - YAML Sample Pipeline (${var.logical_env}) + channel: CURRENT + serverless: true + catalog: ${var.catalog} + schema: ${var.schema} + libraries: + - notebook: + path: ${var.framework_source_path}/dlt_pipeline + + configuration: + bundle.sourcePath: ${workspace.file_path}/src + bundle.target: ${bundle.target} + framework.sourcePath: ${var.framework_source_path} + workspace.host: ${var.workspace_host} + pipeline.layer: ${var.layer} + logicalEnv: ${var.logical_env} + pipeline.dataFlowGroupFilter: yaml_samples + + root_path: ${workspace.file_path}/src/dataflows/yaml_samples + diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/append_view_flow_main.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/append_view_flow_main.yaml new file mode 100644 index 0000000..ec857d6 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/append_view_flow_main.yaml @@ -0,0 +1,29 @@ +# Metadata +dataFlowId: append_view_flow 
+dataFlowGroup: yaml_samples +dataFlowType: flow + +# Target Configuration +targetFormat: delta +targetDetails: + table: append_view_flow + tableProperties: + delta.enableChangeDataFeed: 'true' + +# Flow Groups Configuration +flowGroups: +- flowGroupId: main + flows: + f_customer_append_view: + flowType: append_view + flowDetails: + targetTable: append_view_flow + sourceView: v_append_view_flow + views: + v_append_view_flow: + mode: stream + sourceType: delta + sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_address_main.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_address_main.yaml new file mode 100644 index 0000000..794f9f2 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_address_main.yaml @@ -0,0 +1,29 @@ +# Metadata +dataFlowId: base_customer_address +dataFlowGroup: yaml_samples +dataFlowType: standard + +# Source Configuration +sourceSystem: testSystem +sourceType: delta +sourceViewName: v_customer_address +sourceDetails: + database: '{staging_schema}' + table: customer_address + cdfEnabled: true +mode: stream + +# Target Configuration +targetFormat: delta +targetDetails: + table: customer_address + tableProperties: + delta.enableChangeDataFeed: 'true' + schemaPath: customer_address_schema.json + +# Data Quality and Quarantine Configuration +dataQualityExpectationsEnabled: true +dataQualityExpectationsPath: ./customer_address_dqe.yaml +quarantineMode: table +quarantineTargetDetails: + targetFormat: delta diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_file_source_main.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_file_source_main.yaml new file mode 100644 index 0000000..89b5b40 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_file_source_main.yaml @@ -0,0 +1,39 @@ +# 
Metadata +dataFlowId: base_customer_file_source +dataFlowGroup: yaml_samples +dataFlowType: standard + +# Source Configuration +sourceSystem: testSystem +sourceType: cloudFiles +sourceViewName: v_customer_files +sourceDetails: + path: '{sample_file_location}/customer/' + readerOptions: + cloudFiles.format: csv + header: 'true' + schemaPath: ./customer_file_source_schema.json + selectExp: + - CUSTOMER_ID + - FIRST_NAME + - LAST_NAME + - EMAIL + - CAST(LOAD_TIMESTAMP AS TIMESTAMP) AS LOAD_TIMESTAMP +mode: stream + +# Target Configuration +targetFormat: delta +targetDetails: + table: customer_file_sample + tableProperties: + delta.enableChangeDataFeed: 'true' + schemaPath: customer_schema.json + +# CDC Configuration +cdcSettings: + keys: + - CUSTOMER_ID + sequence_by: LOAD_TIMESTAMP + where: '' + ignore_null_updates: false + scd_type: '1' diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_main.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_main.yaml new file mode 100644 index 0000000..4ab94a5 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/customer_main.yaml @@ -0,0 +1,22 @@ +# Metadata +dataFlowId: base_customer +dataFlowGroup: yaml_samples +dataFlowType: standard + +# Source Configuration +sourceSystem: testSystem +sourceType: delta +sourceViewName: v_customer +sourceDetails: + database: '{staging_schema}' + table: customer + cdfEnabled: true +mode: stream + +# Target Configuration +targetFormat: delta +targetDetails: + table: customer + tableProperties: + delta.enableChangeDataFeed: 'true' + schemaPath: customer_schema.json diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/template_samples_main.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/template_samples_main.yaml new file mode 100644 index 0000000..94802b4 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dataflowspec/template_samples_main.yaml @@ -0,0 +1,18 @@ 
+template: cdc_stream_from_snapshot_template +parameterSets: + + - dataFlowId: template_customer + sourceTable: customer + targetTable: customer_template + cdcKeys: + - CUSTOMER_ID + sequenceByColumn: LOAD_TIMESTAMP + schemaPath: customer_schema.json + + - dataFlowId: template_customer_address + sourceTable: customer_address + targetTable: customer_address_template + cdcKeys: + - CUSTOMER_ID + sequenceByColumn: LOAD_TIMESTAMP + schemaPath: customer_address_schema.json diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/dml/feature_mv_sql_path.sql b/samples/yaml_sample/src/dataflows/yaml_samples/dml/feature_mv_sql_path.sql new file mode 100644 index 0000000..98bc595 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/dml/feature_mv_sql_path.sql @@ -0,0 +1 @@ +SELECT * FROM {staging_schema}.customer \ No newline at end of file diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/expectations/customer_address_dqe.yaml b/samples/yaml_sample/src/dataflows/yaml_samples/expectations/customer_address_dqe.yaml new file mode 100644 index 0000000..c06e2b9 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/expectations/customer_address_dqe.yaml @@ -0,0 +1,8 @@ +expect_or_drop: +- name: PK not null + constraint: CUSTOMER_ID IS NOT NULL + tag: Validity +- name: enabledTest + constraint: CUSTOMER_ID = 1 + tag: Validity + enabled: false diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_address_schema.json b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_address_schema.json new file mode 100644 index 0000000..23597cf --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_address_schema.json @@ -0,0 +1,29 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "CITY", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "STATE", + "type": "string", + 
"nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_file_source_schema.json b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_file_source_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_file_source_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": "timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_schema.json b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_schema.json new file mode 100644 index 0000000..9194cb7 --- /dev/null +++ b/samples/yaml_sample/src/dataflows/yaml_samples/schemas/customer_schema.json @@ -0,0 +1,41 @@ +{ + "type": "struct", + "fields": [ + { + "name": "CUSTOMER_ID", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "FIRST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "LAST_NAME", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "EMAIL", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "DELETE_FLAG", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "LOAD_TIMESTAMP", + "type": 
"timestamp", + "nullable": true, + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/samples/yaml_sample/src/pipeline_configs/dev_secrets.yaml b/samples/yaml_sample/src/pipeline_configs/dev_secrets.yaml new file mode 100644 index 0000000..6f3b0f1 --- /dev/null +++ b/samples/yaml_sample/src/pipeline_configs/dev_secrets.yaml @@ -0,0 +1,4 @@ +kafka_source_bootstrap_servers: + scope: + key: + exceptionEnabled: true diff --git a/samples/yaml_sample/src/pipeline_configs/dev_substitutions.yaml b/samples/yaml_sample/src/pipeline_configs/dev_substitutions.yaml new file mode 100644 index 0000000..a1a1cc8 --- /dev/null +++ b/samples/yaml_sample/src/pipeline_configs/dev_substitutions.yaml @@ -0,0 +1,6 @@ +tokens: + staging_schema: main.lakeflow_samples_staging{logical_env} + bronze_schema: main.lakeflow_samples_bronze{logical_env} + silver_schema: main.lakeflow_samples_silver{logical_env} + staging_volume: stg_volume + sample_file_location: /Volumes/main/lakeflow_samples_staging{logical_env}/stg_volume diff --git a/samples/yaml_sample/src/pipeline_configs/global.json b/samples/yaml_sample/src/pipeline_configs/global.json new file mode 100644 index 0000000..9be8362 --- /dev/null +++ b/samples/yaml_sample/src/pipeline_configs/global.json @@ -0,0 +1,6 @@ +{ + "pipeline_bundle_spec_format": { + "format": "yaml" + }, + "table_migration_state_volume_path": "/Volumes/main/lakeflow_samples_staging_es/stg_volume/checkpoint_state" +} \ No newline at end of file diff --git a/samples/yaml_sample/src/templates/cdc_stream_from_snapshot_template.yaml b/samples/yaml_sample/src/templates/cdc_stream_from_snapshot_template.yaml new file mode 100644 index 0000000..c390bdf --- /dev/null +++ b/samples/yaml_sample/src/templates/cdc_stream_from_snapshot_template.yaml @@ -0,0 +1,92 @@ +name: cdc_stream_from_snapshot_template + +# Parameter Definitions +parameters: + dataFlowId: + type: string + required: true + cdcKeys: + type: list + required: true + sourceTable: + type: string + 
required: true + targetTable: + type: string + required: true + sequenceByColumn: + type: string + required: true + schemaPath: + type: string + required: true + +# Template Definition +template: + dataFlowId: ${param.dataFlowId} + dataFlowGroup: yaml_samples + dataFlowType: flow + targetFormat: delta + targetDetails: + table: ${param.targetTable} + tableProperties: + delta.enableChangeDataFeed: "true" + cdcSettings: + keys: ${param.cdcKeys} + scd_type: "2" + sequence_by: ${param.sequenceByColumn} + except_column_list: + - ${param.sequenceByColumn} + - is_delete + ignore_null_updates: false + apply_as_deletes: is_delete = 1 + flowGroups: + - flowGroupId: main + stagingTables: + stg_${param.dataFlowId}_${param.sourceTable}: + type: ST + tableProperties: + delta.enableChangeDataFeed: "true" + cdcSnapshotSettings: + keys: ${param.cdcKeys} + scd_type: "1" + snapshotType: historical + sourceType: file + source: + format: csv + path: "{sample_file_location}/template_samples/snapshot_${param.sourceTable}/${param.sourceTable}_{version}.csv" + readerOptions: + header: "true" + versionType: timestamp + datetimeFormat: "%Y_%m_%d" + schemaPath: ${param.schemaPath} + selectExp: + - "* EXCEPT(${param.sequenceByColumn})" + - "TO_TIMESTAMP(${param.sequenceByColumn}, 'yyyy-MM-dd HH:mm:ss') AS ${param.sequenceByColumn}" + - "_metadata AS meta_file_metadata" + configFlags: + - disableOperationalMetadata + flows: + f_${param.dataFlowId}_${param.sourceTable}_merge_flow: + flowType: merge + flowDetails: + targetTable: ${param.targetTable} + sourceView: v_stg_${param.dataFlowId}_${param.sourceTable} + views: + v_stg_${param.dataFlowId}_${param.sourceTable}: + mode: stream + sourceType: delta + sourceDetails: + database: live + table: stg_${param.dataFlowId}_${param.sourceTable} + cdfEnabled: true + selectExp: + - "*" + startingVersionFromDLTSetup: true + cdfChangeTypeOverride: + - insert + - update_postimage + - delete + pythonTransform: + functionPath: 
def apply_transform(df: DataFrame) -> DataFrame:
    """
    Emit two rows for every delete change record and offset their sequence timestamps.

    The ``_change_type`` column is renamed to ``meta_cdc_operation``. Rows whose
    operation is ``delete`` are exploded into two rows flagged ``is_delete`` = 0
    and ``is_delete`` = 1; every other row yields a single row with
    ``is_delete`` = 0. For delete rows, ``LOAD_TIMESTAMP`` is moved forward by
    1 millisecond (flag 0) or 2 milliseconds (flag 1); other rows keep their
    original ``LOAD_TIMESTAMP``.
    """
    seq_col = "LOAD_TIMESTAMP"
    op_col = "meta_cdc_operation"

    delete_op = F.col(op_col) == "delete"
    delete_flag = F.col("is_delete")

    # Deletes fan out into flags 0 and 1; all other rows carry a single flag 0.
    flags = F.when(delete_op, F.array(F.lit(0), F.lit(1))).otherwise(F.array(F.lit(0)))

    # Only delete rows have their sequence value shifted.
    shifted_sequence = (
        F.when(delete_op & (delete_flag == 0),
               F.col(seq_col) + F.expr("INTERVAL 1 millisecond"))
        .when(delete_op & (delete_flag == 1),
              F.col(seq_col) + F.expr("INTERVAL 2 millisecond"))
        .otherwise(F.col(seq_col))
    )

    renamed = df.withColumnRenamed("_change_type", op_col)
    exploded = renamed.withColumn("is_delete", F.explode(flags))
    return exploded.withColumn(seq_col, shifted_sequence)
+By default these are not committed to Git, as 'scratch' is listed in .gitignore. diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..e3b5a7d --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,173 @@ +# Scripts + +This directory contains utility scripts and tools for the Lakeflow Framework project. + +## Available Scripts + +### `validate_dataflows.py` + +Validates dataflow JSON/YAML specification files against the project's JSON schemas. + +**Usage:** +```bash +# Validate all dataflow files in the project (with version mapping by default) +python scripts/validate_dataflows.py + +# Validate files in a specific directory +python scripts/validate_dataflows.py samples/bronze_sample/ + +# Validate a single file +python scripts/validate_dataflows.py samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_main.json + +# Validate without version mapping (strict mode - validates against current schema only) +python scripts/validate_dataflows.py --no-mapping samples/bronze_sample/ + +# Verbose output +python scripts/validate_dataflows.py -v +``` + +**What it validates:** +- Searches for all `*_main.json` files in `dataflows/**/dataflowspec/` directories +- Validates against `src/schemas/main.json` (which routes to appropriate sub-schemas) +- Reports validation errors with clear messages + +**Version Mapping (enabled by default):** +- Automatically detects `dataFlowVersion` property in spec files +- Applies version-specific transformations from `src/config/dataflow_spec_mapping/{version}/` +- Transforms old property names to current schema (e.g., `cdcApplyChanges` → `cdcSettings`) +- Useful for validating legacy spec files against the current schema +- Shows which files had mappings applied with a version indicator `[v0.1.0]` +- Use `--no-mapping` flag to disable this behavior and validate strictly against current schema + +**Requirements:** +- Python 3.9+ +- `jsonschema` package (install with: `pip install jsonschema`) + 
+**Exit codes:** +- `0`: All validations passed +- `1`: One or more validations failed or error occurred + +--- + +### `convert_json_to_yaml.py` + +Converts Lakeflow Framework pipeline bundles from JSON format to YAML format, with optional validation. + +**Features:** +- Converts dataflow specifications, flow groups, expectations, substitutions, and secrets files +- Automatically updates file extensions (e.g., `*_main.json` → `*_main.yaml`) +- Updates references to DQE files (`.json` → `.yaml` in `dataQualityExpectationsPath`) +- Optional validation using `validate_dataflows.py` +- Bundle-level or single-file conversion +- Dry-run mode to preview changes + +**Usage:** +```bash +# Convert a single file +python scripts/convert_json_to_yaml.py --file path/to/file.json + +# Convert an entire bundle +python scripts/convert_json_to_yaml.py --bundle samples/bronze_sample --output samples/bronze_sample_yaml + +# Convert with overwrite +python scripts/convert_json_to_yaml.py --bundle samples/bronze_sample --output samples/bronze_sample_yaml --overwrite + +# Dry run (preview changes without making them) +python scripts/convert_json_to_yaml.py --bundle samples/bronze_sample --dry-run + +# Convert without validation +python scripts/convert_json_to_yaml.py --bundle samples/bronze_sample --no-validate +``` + +**Command-Line Options:** +- `--file PATH` - Convert a single JSON file to YAML +- `--bundle PATH` - Convert an entire bundle directory +- `--output PATH` - Output location (file or directory) +- `--overwrite` - Overwrite existing files/directories +- `--convert-schemas` - Also convert schema JSON files (default: skip) +- `--no-validate` - Skip validation (validation using `validate_dataflows.py` is enabled by default) +- `--dry-run` - Preview changes without modifying files + +**File Type Conversions:** +| Original (JSON) | Converted (YAML) | +|----------------|------------------| +| `*_main.json` | `*_main.yaml` | +| `*_flow.json` | `*_flow.yaml` | +| `*_dqe.json` | 
`*_expectations.yaml` | +| `*_substitutions.json` | `*_substitutions.yaml` | +| `*_secrets.json` | `*_secrets.yaml` | + +**Validation:** +- By default, runs `validate_dataflows.py` on converted files +- Validates all `*_main.yaml` dataflow specifications +- Uses version mapping to support legacy specs +- Reports validation results in the conversion summary + +**Example Output:** +``` +Converting bundle from samples/bronze_sample to samples/bronze_sample_yaml +Copying bundle structure... +Scanning for JSON files to convert... +Found 34 JSON files to convert + Converting: src/dataflows/base_samples/dataflowspec/customer_main.json -> customer_main.yaml + Converting: src/pipeline_configs/dev_substitutions.json -> dev_substitutions.yaml +... + +================================================================================ +Validating converted files... +================================================================================ +Validating 30 file(s) (with version mapping)... +✓ All dataflow files validated successfully + +================================================================================ +Conversion Summary: +================================================================================ +Files converted: 34 + - Main specs: 30 + - Flow groups: 0 + - Expectations: 2 + - Secrets: 1 + - Substitutions: 1 +Files removed: 34 +Validation: PASSED +================================================================================ +``` + +**Requirements:** +- Python 3.9+ +- `pyyaml` package (install with: `pip install pyyaml`) +- `jsonschema` package (for validation, install with: `pip install jsonschema`) + +**Exit codes:** +- `0`: Success (all files converted successfully) +- `1`: Error occurred (validation failures, conversion errors, or missing files) + +**Programmatic Usage:** +```python +from convert_json_to_yaml import json_to_yaml_basic, convert_bundle, convert_json_file_to_yaml + +# Convert a single file +convert_json_file_to_yaml( + 
input_path="path/to/file.json", + output_path="path/to/file.yaml" +) + +# Convert an entire bundle +stats = convert_bundle( + source_bundle_path="samples/bronze_sample", + target_bundle_path="samples/bronze_sample_yaml", + validate=True, + overwrite=True +) + +print(f"Converted {stats['converted_files']} files") +``` + +--- + +## Notes + +- The converted YAML files are fully compatible with the Lakeflow Framework's YAML support +- Both JSON and YAML formats are supported by the framework +- Schema files typically remain in JSON format (use `--convert-schemas` if needed) + diff --git a/scripts/convert_json_to_yaml.py b/scripts/convert_json_to_yaml.py new file mode 100644 index 0000000..b109972 --- /dev/null +++ b/scripts/convert_json_to_yaml.py @@ -0,0 +1,529 @@ +""" +JSON to YAML Converter for Lakeflow Framework Bundles + +This script provides utilities to convert Lakeflow Framework pipeline bundles from JSON format to YAML format. +It handles conversion of: +- Dataflow specifications (main specs) +- Flow group specifications +- Data quality expectations (DQE) +- Substitution files +- Secrets files + +The converter can optionally validate converted files using the validate_dataflows.py script. 
+ +Usage: + # Convert a single JSON file to YAML with validation + python convert_json_to_yaml.py --file path/to/file.json + + # Convert an entire bundle with validation + python convert_json_to_yaml.py --bundle path/to/bundle --output path/to/output_bundle + + # Convert without validation + python convert_json_to_yaml.py --bundle path/to/bundle --no-validate + + # Dry run to see what would be converted + python convert_json_to_yaml.py --bundle path/to/bundle --dry-run +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Dict, Any, List, Tuple + +import yaml + + +def validate_with_validation_script(target_path: Path) -> Tuple[bool, List[str]]: + """ + Validate dataflow files using the validate_dataflows.py script. + + Args: + target_path: Path to the directory containing dataflows to validate + + Returns: + Tuple of (success, error_messages) + """ + script_dir = Path(__file__).parent + validate_script = script_dir / "validate_dataflows.py" + + if not validate_script.exists(): + return False, [f"Validation script not found: {validate_script}"] + + try: + # Run the validation script + result = subprocess.run( + [sys.executable, str(validate_script), str(target_path)], + capture_output=True, + text=True, + check=False + ) + + # The script returns 0 for success, 1 for failure + if result.returncode == 0: + return True, [] + else: + # Parse error output + errors = result.stdout.split('\n') if result.stdout else [] + return False, errors + + except Exception as e: # pylint: disable=broad-except + return False, [f"Error running validation script: {e}"] + + +def update_dqe_path_extensions(data: Any) -> Any: + """ + Recursively update dataQualityExpectationsPath extensions from .json to .yaml. + + This function walks through the data structure and updates any + dataQualityExpectationsPath properties that end with .json to end with .yaml instead. 
+ + Args: + data: The data structure to update (dict, list, or primitive) + + Returns: + The updated data structure + """ + if isinstance(data, dict): + updated_dict = {} + for key, value in data.items(): + if key == 'dataQualityExpectationsPath' and isinstance(value, str): + # Replace .json extension with .yaml + if value.endswith('.json'): + updated_dict[key] = value[:-5] + '.yaml' + else: + updated_dict[key] = value + else: + # Recursively process nested structures + updated_dict[key] = update_dqe_path_extensions(value) + return updated_dict + elif isinstance(data, list): + return [update_dqe_path_extensions(item) for item in data] + else: + return data + + +def json_to_yaml_basic(json_data: Dict[str, Any]) -> str: + """ + Convert JSON data to YAML format with clean formatting. + + This is the most basic conversion function that handles the core transformation. + + Args: + json_data: Dictionary containing the JSON data + + Returns: + str: YAML formatted string + + Example: + >>> data = {"key": "value", "nested": {"item": 1}} + >>> yaml_str = json_to_yaml_basic(data) + >>> print(yaml_str) + key: value + nested: + item: 1 + """ + return yaml.dump( + json_data, + default_flow_style=False, + sort_keys=False, + allow_unicode=True, + indent=2, + width=120 + ) + + +def convert_json_file_to_yaml( + input_path: str, + output_path: str = None, + overwrite: bool = False +) -> str: + """ + Convert a single JSON file to YAML format. 
+ + Args: + input_path: Path to the input JSON file + output_path: Path for the output YAML file (optional, defaults to same name with .yaml extension) + overwrite: Whether to overwrite existing files + + Returns: + str: Path to the created YAML file + + Raises: + FileNotFoundError: If input file doesn't exist + FileExistsError: If output file exists and overwrite is False + ValueError: If input file is not valid JSON + """ + # Validate input file + input_path = Path(input_path) + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {input_path}") + + # Read JSON file + try: + with open(input_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON file '{input_path}': {e}") from e + + # Update dataQualityExpectationsPath extensions from .json to .yaml + json_data = update_dqe_path_extensions(json_data) + + # Determine output path + if output_path is None: + output_path = input_path.with_suffix('.yaml') + else: + output_path = Path(output_path) + + # Check if output file exists + if output_path.exists() and not overwrite: + raise FileExistsError(f"Output file already exists: {output_path}. Use overwrite=True to replace.") + + # Create output directory if needed + output_path.parent.mkdir(parents=True, exist_ok=True) + + # Convert and write YAML + yaml_content = json_to_yaml_basic(json_data) + with open(output_path, 'w', encoding='utf-8') as f: + f.write(yaml_content) + + return str(output_path) + + +def get_file_type_and_new_name(file_path: Path) -> Tuple[str, str]: + """ + Determine the file type and generate the appropriate YAML filename. 
+ + Args: + file_path: Path to the JSON file + + Returns: + Tuple of (file_type, new_filename) where file_type is one of: + 'main_spec', 'flow_group', 'expectations', 'secrets', 'substitutions', 'other' + """ + filename = file_path.name + + # Check for main spec files + if filename.endswith('_main.json'): + base_name = filename[:-len('_main.json')] + return 'main_spec', f"{base_name}_main.yaml" + + # Check for flow group files + if filename.endswith('_flow.json'): + base_name = filename[:-len('_flow.json')] + return 'flow_group', f"{base_name}_flow.yaml" + + # Check for secrets files + if filename.endswith('_secrets.json'): + base_name = filename[:-len('_secrets.json')] + return 'secrets', f"{base_name}_secrets.yaml" + + # Check for substitutions files + if filename.endswith('_substitutions.json'): + base_name = filename[:-len('_substitutions.json')] + return 'substitutions', f"{base_name}_substitutions.yaml" + + # Check for expectations files (in expectations directory) + if 'expectations' in str(file_path.parent) or filename.endswith('_dqe.json'): + base_name = filename[:-len('.json')] + return 'expectations', f"{base_name}_expectations.yaml" + + # Other JSON files - typically schemas, don't convert + return 'other', filename + + +def convert_bundle( + source_bundle_path: str, + target_bundle_path: str, + overwrite: bool = False, + convert_schemas: bool = False, + validate: bool = True, + dry_run: bool = False +) -> Dict[str, Any]: + """ + Convert an entire Lakeflow Framework bundle from JSON to YAML format with optional validation. + + This function: + 1. Copies the entire bundle structure to a new location + 2. 
Converts all relevant JSON files to YAML format: + - Dataflow specifications (*_main.json -> *_main.yaml) + - Flow group specifications (*_flow.json -> *_flow.yaml) + - Data quality expectations (*_dqe.json -> *_expectations.yaml) + - Substitution files (*_substitutions.json -> *_substitutions.yaml) + - Secrets files (*_secrets.json -> *_secrets.yaml) + 3. Removes the original JSON files + 4. Optionally validates converted files using validate_dataflows.py + + Args: + source_bundle_path: Path to the source bundle directory + target_bundle_path: Path to the target bundle directory + overwrite: Whether to overwrite existing target bundle + convert_schemas: Whether to convert schema JSON files (default: False, as schemas typically stay JSON) + validate: Whether to validate converted files using validate_dataflows.py (default: True) + dry_run: If True, only print what would be done without making changes + + Returns: + Dict containing conversion statistics: + { + 'copied_files': int, + 'converted_files': int, + 'removed_files': int, + 'validation_errors': int, + 'errors': List[str], + 'converted_by_type': { + 'main_spec': int, + 'flow_group': int, + 'expectations': int, + 'secrets': int, + 'substitutions': int + } + } + """ + source_path = Path(source_bundle_path) + target_path = Path(target_bundle_path) + + # Validation + if not source_path.exists(): + raise FileNotFoundError(f"Source bundle not found: {source_path}") + + if target_path.exists(): + if not overwrite: + raise FileExistsError( + f"Target bundle already exists: {target_path}. Use overwrite=True to replace." 
+ ) + if not dry_run: + shutil.rmtree(target_path) + + # Initialize statistics + stats = { + 'copied_files': 0, + 'converted_files': 0, + 'removed_files': 0, + 'validation_errors': 0, + 'errors': [], + 'converted_by_type': { + 'main_spec': 0, + 'flow_group': 0, + 'expectations': 0, + 'secrets': 0, + 'substitutions': 0, + 'other': 0 + } + } + + print(f"{'[DRY RUN] ' if dry_run else ''}Converting bundle from {source_path} to {target_path}") + + # Step 1: Copy the entire bundle + print(f"{'[DRY RUN] ' if dry_run else ''}Copying bundle structure...") + if not dry_run: + shutil.copytree(source_path, target_path) + + # Step 2: Find and convert JSON files + print(f"{'[DRY RUN] ' if dry_run else ''}Scanning for JSON files to convert...") + + # Walk through the target directory + json_files_to_convert = [] + for root, _, files in os.walk(target_path if not dry_run else source_path): + for file in files: + if file.endswith('.json'): + file_path = Path(root) / file + relative_path = file_path.relative_to(target_path if not dry_run else source_path) + + # Determine file type + file_type, new_filename = get_file_type_and_new_name(file_path) + + # Skip schema files unless explicitly requested + if file_type == 'other' and not convert_schemas: + continue + + json_files_to_convert.append((file_path, file_type, new_filename, relative_path)) + + print(f"Found {len(json_files_to_convert)} JSON files to convert") + + # Step 3: Convert files + for file_path, file_type, new_filename, relative_path in json_files_to_convert: + try: + print(f" Converting: {relative_path} -> {new_filename}") + + if not dry_run: + # Read JSON + with open(file_path, 'r', encoding='utf-8') as f: + json_data = json.load(f) + + # Update dataQualityExpectationsPath extensions from .json to .yaml + json_data = update_dqe_path_extensions(json_data) + + # Convert to YAML + yaml_content = json_to_yaml_basic(json_data) + + # Write YAML file + yaml_path = file_path.parent / new_filename + with open(yaml_path, 'w', 
encoding='utf-8') as f: + f.write(yaml_content) + + # Remove original JSON file + file_path.unlink() + + stats['converted_files'] += 1 + stats['removed_files'] += 1 + stats['converted_by_type'][file_type] += 1 + else: + stats['converted_files'] += 1 + stats['converted_by_type'][file_type] += 1 + + except ValueError as e: + error_msg = f"Conversion error in {relative_path}: {str(e)}" + print(f" ERROR: {error_msg}") + stats['errors'].append(error_msg) + except Exception as e: # pylint: disable=broad-except + error_msg = f"Error converting {relative_path}: {str(e)}" + print(f" ERROR: {error_msg}") + stats['errors'].append(error_msg) + + # Step 4: Validate using validate_dataflows.py if requested + if validate and not dry_run: + print("\n" + "="*80) + print("Validating converted files...") + print("="*80) + + validation_success, validation_errors = validate_with_validation_script(target_path) + + if not validation_success: + stats['validation_errors'] = 1 + if validation_errors: + print("\nValidation output:") + for error in validation_errors: + if error.strip(): + print(error) + stats['errors'].extend(validation_errors) + else: + print("✓ All dataflow files validated successfully") + + # Print summary + print("\n" + "="*80) + print("Conversion Summary:") + print("="*80) + print(f"Files converted: {stats['converted_files']}") + print(f" - Main specs: {stats['converted_by_type']['main_spec']}") + print(f" - Flow groups: {stats['converted_by_type']['flow_group']}") + print(f" - Expectations: {stats['converted_by_type']['expectations']}") + print(f" - Secrets: {stats['converted_by_type']['secrets']}") + print(f" - Substitutions: {stats['converted_by_type']['substitutions']}") + if convert_schemas: + print(f" - Other (schemas, etc.): {stats['converted_by_type']['other']}") + print(f"Files removed: {stats['removed_files']}") + if validate: + if stats['validation_errors'] > 0: + print("Validation: FAILED") + else: + print("Validation: PASSED") + if stats['errors']: + 
def main():
    """Main CLI entry point.

    Parses the command line and converts either a single JSON file
    (``--file``) or a whole bundle directory (``--bundle``) to YAML.

    Exit status:
        0 - conversion (and validation, when enabled) succeeded, or a dry
            run completed without errors.
        1 - a conversion error occurred, validation failed, or an exception
            was raised while converting.
        2 - invalid argument combination (raised by ``argparse``).
    """
    parser = argparse.ArgumentParser(
        description="Convert Lakeflow Framework bundles from JSON to YAML format with optional validation"
    )

    # Add arguments
    parser.add_argument(
        '--file',
        type=str,
        help='Convert a single JSON file to YAML'
    )
    parser.add_argument(
        '--bundle',
        type=str,
        help='Path to source bundle directory to convert'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Path to output location (for --file or --bundle)'
    )
    parser.add_argument(
        '--overwrite',
        action='store_true',
        help='Overwrite existing files/directories'
    )
    parser.add_argument(
        '--convert-schemas',
        action='store_true',
        help='Also convert schema JSON files (by default they are skipped)'
    )
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='Skip validation (validation using validate_dataflows.py is enabled by default)'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Print what would be done without making changes'
    )

    args = parser.parse_args()

    # Validate arguments
    if not args.file and not args.bundle:
        parser.error("Either --file or --bundle must be specified")

    if args.file and args.bundle:
        parser.error("Cannot specify both --file and --bundle")

    try:
        if args.file:
            if args.dry_run:
                # --dry-run promises "no changes": report the intent only and
                # never call the converter, which writes the YAML file.
                if not Path(args.file).is_file():
                    raise FileNotFoundError(f"File not found: {args.file}")
                destination = args.output if args.output else "(default output path)"
                print(f"[DRY RUN] Would convert: {args.file} -> {destination}")
                print("\nDry run complete. No files were modified.")
                return

            # Convert single file
            output_path = convert_json_file_to_yaml(
                args.file,
                args.output,
                args.overwrite
            )
            print(f"Successfully converted: {args.file} -> {output_path}")
            return

        # Convert bundle
        output = args.output
        if not output:
            # Default output is source + "_yaml" suffix
            source_path = Path(args.bundle)
            output = str(source_path.parent / f"{source_path.name}_yaml")

        stats = convert_bundle(
            args.bundle,
            output,
            args.overwrite,
            args.convert_schemas,
            validate=not args.no_validate,
            dry_run=args.dry_run
        )

        # A failed validation that produced no captured output leaves
        # stats['errors'] empty, so the validation flag is checked as well.
        failed = bool(stats['errors']) or stats.get('validation_errors', 0) > 0

        if args.dry_run:
            print("\nDry run complete. No files were modified.")
        elif failed:
            # Do not claim success when conversion or validation reported errors.
            print(f"\nBundle converted with errors: {output}", file=sys.stderr)
        else:
            print(f"\nBundle successfully converted to: {output}")

        # Exit with error code if there were errors
        if failed:
            sys.exit(1)

    except (FileNotFoundError, FileExistsError, ValueError) as e:
        # Expected errors with clear messages
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:  # pylint: disable=broad-except
        # Unexpected errors
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
+ +Usage: + python scripts/validate_dataflows.py # Validate all dataflow files (with version mapping) + python scripts/validate_dataflows.py samples/bronze_sample/ # Validate all files in specific directory + python scripts/validate_dataflows.py path/to/file_main.json # Validate specific file + python scripts/validate_dataflows.py --no-mapping # Validate without applying version mappings + +Examples: + # From project root + python scripts/validate_dataflows.py + + # Validate only bronze samples + python scripts/validate_dataflows.py samples/bronze_sample/ + + # Validate without version mapping (strict validation against current schema) + python scripts/validate_dataflows.py --no-mapping samples/bronze_sample/ + + # Validate a single file + python scripts/validate_dataflows.py samples/bronze_sample/src/dataflows/base_samples/dataflowspec/customer_main.json +""" + +import argparse +import json +import sys +import os +from pathlib import Path +from typing import List, Tuple, Optional, Dict +import warnings +import copy + +# Suppress deprecation warnings from jsonschema +warnings.filterwarnings('ignore', category=DeprecationWarning) + +try: + import jsonschema + from jsonschema import RefResolver, Draft7Validator +except ImportError: + print("Error: jsonschema package not found. 
def load_dataflow_spec_mapping(project_root: Path, version: str) -> Optional[Dict]:
    """
    Load the dataflow spec mapping for a specific version.

    Args:
        project_root: Root directory of the project
        version: Version string (e.g., "0.1.0"). The value comes straight from
            the spec's ``dataFlowVersion`` field, so it is coerced to ``str``
            before being used as a directory name.

    Returns:
        Mapping dictionary, or None if no mapping file exists for the version,
        the file cannot be read/parsed, or its top level is not a JSON object.
    """
    mapping_path = (
        project_root / "src" / "config" / "dataflow_spec_mapping"
        / str(version) / "dataflow_spec_mapping.json"
    )

    if not mapping_path.is_file():
        return None

    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(mapping_path, encoding="utf-8") as f:
            mapping = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"{YELLOW}Warning: Could not load mapping for version {version}: {e}{RESET}")
        return None

    if not isinstance(mapping, dict):
        # Callers use mapping.get(...); anything but an object is unusable.
        print(f"{YELLOW}Warning: Could not load mapping for version {version}: "
              f"top-level JSON value is not an object{RESET}")
        return None

    return mapping
def _lookup_path(data: object, parts: List[str]) -> Tuple[bool, object]:
    """Follow ``parts`` through nested dicts and return ``(found, value)``."""
    current = data
    for part in parts:
        if not isinstance(current, dict) or part not in current:
            return False, None
        current = current[part]
    return True, current


def _move_condition_met(data: Dict, condition: object) -> bool:
    """Evaluate a move ``condition`` of the form ``{"key", "operator", "value"}``."""
    if not isinstance(condition, dict):
        return False

    # NOTE(review): the key is resolved as a dotted path from the document
    # root (e.g. top-level "sourceType") - confirm against the framework's
    # own mapping implementation.
    found, actual = _lookup_path(data, str(condition.get("key", "")).split('.'))
    if not found:
        actual = None

    operator = condition.get("operator")
    expected = condition.get("value")
    if operator == "not_equal_to":
        return actual != expected
    if operator == "equal_to":
        # Not used by the shipped mappings; provided as the natural inverse.
        return actual == expected

    # Unknown operator: be conservative and leave the data untouched.
    return False


def apply_move(data: Dict, move_map: Dict) -> None:
    """
    Move values from one path to another (e.g., "targetDetails.topic" -> "targetDetails.kafkaOptions.topic").

    Each destination may be either a dotted path string, or a dict of the form
    ``{"to": "<dotted path>", "condition": {"key": ..., "operator": ..., "value": ...}}``
    as used by the 0.2.0 mapping. A conditional move is applied only when its
    condition holds.

    Entries are skipped (data left unchanged) when the source path does not
    exist, the destination is malformed, the condition is not met, or an
    existing non-dict value blocks the destination path.

    Args:
        data: Dictionary to modify in-place
        move_map: Map of source_path -> destination_path (or destination dict)
    """
    for src_path, destination in move_map.items():
        if isinstance(destination, dict):
            dest_path = destination.get("to")
            condition = destination.get("condition")
        else:
            dest_path, condition = destination, None

        if not isinstance(dest_path, str) or not dest_path:
            continue
        if condition is not None and not _move_condition_met(data, condition):
            continue

        # Locate the dict that holds the source key
        *src_parents, src_key = src_path.split('.')
        found, src_parent = _lookup_path(data, src_parents)
        if not found or not isinstance(src_parent, dict) or src_key not in src_parent:
            continue

        # Navigate/create destination path. A break can only happen on a
        # pre-existing non-dict value, i.e. before anything was created.
        *dest_parents, dest_key = dest_path.split('.')
        dest_parent = data
        for part in dest_parents:
            child = dest_parent.setdefault(part, {})
            if not isinstance(child, dict):
                break
            dest_parent = child
        else:
            # Pop first so that src == dest keeps the value instead of deleting it.
            dest_parent[dest_key] = src_parent.pop(src_key)
Tuple[bool, List[str], Optional[str]]: + """ + Validate a single JSON file against the schema. + + Args: + file_path: Path to the JSON file to validate + schema_path: Path to the schema file + apply_mapping: Whether to apply version mapping before validation + project_root: Root directory of the project (required if apply_mapping is True) + + Returns: + Tuple of (is_valid, error_messages_list, version_applied) + """ + version_applied = None + + try: + with open(file_path) as f: + data = json.load(f) + + # Apply version mapping if requested + if apply_mapping and project_root: + version = data.get("dataFlowVersion") + if version: + mapping = load_dataflow_spec_mapping(project_root, version) + if mapping: + data = apply_dataflow_spec_mapping(data, mapping) + version_applied = version + + with open(schema_path) as f: + schema = json.load(f) + + # Create resolver for $ref references + schema_dir = os.path.dirname(os.path.abspath(schema_path)) + resolver = RefResolver(base_uri=f'file://{schema_dir}/', referrer=schema) + + # Create validator and collect all errors + validator = Draft7Validator(schema, resolver=resolver) + errors = list(validator.iter_errors(data)) + + if not errors: + return True, ["Valid"], version_applied + + # Format all error messages + error_messages = [] + for error in errors: + error_msg = f"{error.message}" + + # Handle "is not valid under any of the given schemas" or "should not be valid under" (validation failure) + if "is not valid under any of the given schemas" in error_msg or ("should not be valid under" in error_msg and "'datetimeFormat']}" in error_msg): + # Check if this is a versionType/datetimeFormat issue + try: + # Navigate to the error location + error_data = data + for path_part in error.path: + error_data = error_data[path_part] + + # Check if it's the historicalSnapshotFileSource anyOf issue + if isinstance(error_data, dict) and 'versionType' in error_data: + version_type = error_data.get('versionType') + has_datetime = 
'datetimeFormat' in error_data + + if version_type == 'integer' and has_datetime: + error_msg = f"Property 'datetimeFormat' is not allowed when versionType is 'integer'" + elif version_type == 'timestamp' and not has_datetime: + error_msg = f"Property 'datetimeFormat' is required when versionType is 'timestamp'" + except: + pass # Fall back to original message + + # Handle "should not be valid under" errors (from not/anyOf constraints) + elif "should not be valid under" in error_msg and "anyOf" in error_msg: + # Extract which properties are disallowed from the schema + try: + # The schema might have a 'not' wrapper containing the anyOf + anyof_schema = error.schema.get('not', {}).get('anyOf', []) + if not anyof_schema: + anyof_schema = error.schema.get('anyOf', []) + + if anyof_schema: + disallowed_props = [] + for condition in anyof_schema: + if 'required' in condition: + disallowed_props.extend(condition['required']) + + # Check which of these properties are actually present in the data + present_props = [] + for prop in disallowed_props: + if prop in data: + present_props.append(prop) + + if present_props: + props_str = ', '.join(f"'{p}'" for p in present_props) + error_msg = f"Properties {props_str} are not allowed in this context" + else: + error_msg = f"One or more disallowed properties are present" + except: + pass # Fall back to original message + + if error.path: + path_str = '.'.join(str(p) for p in error.path) + if "at path:" not in error_msg: # Don't add path twice + error_msg += f" at path: {path_str}" + + error_messages.append(error_msg) + + return False, error_messages, version_applied + + except FileNotFoundError as e: + return False, [f"File not found: {e}"], version_applied + except json.JSONDecodeError as e: + return False, [f"Invalid JSON: {e}"], version_applied + except Exception as e: + return False, [f"Error: {e}"], version_applied + + +def main(): + parser = argparse.ArgumentParser( + description="Validate dataflow JSON files against 
schemas", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Validate all dataflow files + %(prog)s samples/bronze_sample/ # Validate specific directory + %(prog)s path/to/customer_main.json # Validate single file + """ + ) + parser.add_argument( + 'path', + nargs='?', + default='.', + help='Path to file or directory to validate (default: current directory, searches recursively)' + ) + parser.add_argument( + '-v', '--verbose', + action='store_true', + help='Show verbose output' + ) + parser.add_argument( + '--no-mapping', + action='store_true', + help='Skip version mapping transformations (validate strict against current schema without transforming legacy properties)' + ) + + args = parser.parse_args() + + # Resolve search path + search_path = Path(args.path).resolve() + + if not search_path.exists(): + print(f"{RED}Error: Path does not exist: {args.path}{RESET}") + return 1 + + # Find project root and schema + try: + project_root = find_project_root() + schema_path = project_root / "src" / "schemas" / "main.json" + except FileNotFoundError as e: + print(f"{RED}Error: {e}{RESET}") + return 1 + + # Apply mapping by default unless --no-mapping is specified + apply_mapping = not args.no_mapping + + if args.verbose: + print(f"{BLUE}Project root: {project_root}{RESET}") + print(f"{BLUE}Schema path: {schema_path}{RESET}") + print(f"{BLUE}Search path: {search_path}{RESET}") + print(f"{BLUE}Apply mapping: {apply_mapping}{RESET}\n") + + # Find files to validate + files = find_dataflow_files(search_path) + + if not files: + print(f"{YELLOW}No dataflow files found in: {args.path}{RESET}") + return 0 + + # Validate each file + if args.no_mapping: + mode_str = " (strict mode - no version mapping)" + else: + mode_str = " (with version mapping)" + print(f"Validating {len(files)} file(s){mode_str}...\n") + + passed = 0 + failed = 0 + mapped_count = 0 + + for file_path in sorted(files): + # Display relative path from project root if 
possible + try: + rel_path = file_path.relative_to(project_root) + except ValueError: + rel_path = file_path + + is_valid, error_messages, version_applied = validate_file( + file_path, schema_path, + apply_mapping=apply_mapping, + project_root=project_root + ) + + if is_valid: + version_str = f" {BLUE}[v{version_applied}]{RESET}" if version_applied else "" + print(f"{GREEN}✓{RESET} {rel_path}{version_str}") + passed += 1 + if version_applied: + mapped_count += 1 + else: + version_str = f" {BLUE}[v{version_applied}]{RESET}" if version_applied else "" + error_count = len(error_messages) + error_label = "error" if error_count == 1 else "errors" + print(f"{RED}✗{RESET} {rel_path}{version_str} {RED}({error_count} {error_label}){RESET}") + for i, message in enumerate(error_messages, 1): + print(f" {RED}{i}. {message}{RESET}") + failed += 1 + + # Summary + print(f"\n{'='*60}") + print(f"Total: {len(files)} | {GREEN}Passed: {passed}{RESET} | {RED}Failed: {failed}{RESET}") + if apply_mapping and mapped_count > 0: + print(f"Mapped: {BLUE}{mapped_count}{RESET} files had version mappings applied") + + return 1 if failed > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..e607798 Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..025753c --- /dev/null +++ b/src/__init__.py @@ -0,0 +1,67 @@ +""" +Lakeflow Framework - A framework for building Delta Live Tables pipelines. + +This package provides tools and utilities for creating, managing, and orchestrating +Delta Live Tables (DLT) pipelines in Databricks. 
+""" + +from .dataflow import ( + DataFlow, + FlowConfig, + FlowGroup, + BaseFlow, + QuarantineManager, + TableMigrationManager, + TargetType, + SinkType, + View, + StagingTable +) +from .dataflow.dataflow_spec import DataflowSpec +from .dataflow_spec_builder import DataflowSpecBuilder, DataQualityExpectationBuilder +from .dlt_pipeline_builder import DLTPipelineBuilder +from .pipeline_config import ( + get_spark, + get_logger, + get_operational_metadata_schema, + get_pipeline_details, + get_substitution_manager, + get_mandatory_table_properties, + initialize_core, + initialize_operational_metadata_schema, + initialize_pipeline_details, + initialize_substitution_manager, + initialize_mandatory_table_properties +) +from .secrets_manager import SecretsManager +from .substitution_manager import SubstitutionManager +from .constants import SystemColumns, MetaDataColumnDefs +from . import utility + +__all__ = [ + 'DLTPipelineBuilder', + 'DataflowSpec', + 'DataflowSpecBuilder', + 'DataQualityExpectationBuilder', + + # Main classes + 'DataFlow', 'FlowConfig', 'FlowGroup', 'BaseFlow', 'StagingTable', 'View', + + # Managers + 'SecretsManager', 'SubstitutionManager', 'TableMigrationManager', 'QuarantineManager', + + # Types and Enums + 'TargetType', 'SinkType', + + # Constants + 'SystemColumns', 'MetaDataColumnDefs', + + # Utilities + 'utility', + + # Pipeline Config + 'get_spark', 'get_logger', 'get_operational_metadata_schema', 'get_pipeline_details', 'get_substitution_manager', 'get_mandatory_table_properties', + 'initialize_core', 'initialize_operational_metadata_schema', 'initialize_pipeline_details', 'initialize_substitution_manager', 'initialize_mandatory_table_properties' +] + +__version__ = '0.1.0' diff --git a/src/config/dataflow_spec_mapping/0.1.0/dataflow_spec_mapping.json b/src/config/dataflow_spec_mapping/0.1.0/dataflow_spec_mapping.json new file mode 100644 index 0000000..6b5ba0e --- /dev/null +++ 
b/src/config/dataflow_spec_mapping/0.1.0/dataflow_spec_mapping.json @@ -0,0 +1,16 @@ +{ + "global": { + "rename_all": { + "cdcApplyChanges": "cdcSettings", + "cdcApplyChangesFromSnapshot": "cdcSnapshotSettings" + }, + "rename_specific": { + "targetDetails.topic": "targetDetails.name", + "targetDetails.kafkaOptions": "targetDetails.sinkOptions" + + }, + "move": { + "targetDetails.topic": "targetDetails.kafkaOptions.topic" + } + } +} \ No newline at end of file diff --git a/src/config/dataflow_spec_mapping/0.2.0/dataflow_spec_mapping.json b/src/config/dataflow_spec_mapping/0.2.0/dataflow_spec_mapping.json new file mode 100644 index 0000000..2cecde2 --- /dev/null +++ b/src/config/dataflow_spec_mapping/0.2.0/dataflow_spec_mapping.json @@ -0,0 +1,25 @@ +{ + "global": { + "rename_all": { + "cdcApplyChanges": "cdcSettings", + "cdcApplyChangesFromSnapshot": "cdcSnapshotSettings", + "spark_conf": "sparkConf", + "pythonFunctionPath": "functionPath" + }, + "rename_specific": { + "targetDetails.topic": "targetDetails.name", + "targetDetails.kafkaOptions": "targetDetails.sinkOptions" + }, + "move": { + "targetDetails.topic": "targetDetails.kafkaOptions.topic", + "sourceDetails.pythonFunctionPath": { + "to": "sourceDetails.pythonTransform.functionPath", + "condition": { + "key": "sourceType", + "operator": "not_equal_to", + "value": "python" + } + } + } + } +} \ No newline at end of file diff --git a/src/config/global.json b/src/config/global.json new file mode 100644 index 0000000..731a12d --- /dev/null +++ b/src/config/global.json @@ -0,0 +1,16 @@ +{ + "pipeline_bundle_spec_format": { + "format": "json", + "allow_override": true + }, + "spark_config": { + "spark.databricks.sql.streamingTable.cdf.applyChanges.returnPhysicalCdf": true, + "pipelines.streamingFlowReadOptionsEnabled": true, + "pipelines.externalSink.enabled": true + }, + "mandatory_table_properties": { + "delta.logRetentionDuration": "interval 45 days", + "delta.deletedFileRetentionDuration": "interval 45 days", + 
"delta.enableRowTracking": "true" + } +} diff --git a/src/config/operational_metadata_bronze.json b/src/config/operational_metadata_bronze.json new file mode 100644 index 0000000..fdce258 --- /dev/null +++ b/src/config/operational_metadata_bronze.json @@ -0,0 +1,57 @@ +{ + "type": "struct", + "fields": [ + { + "name": "meta_load_details", + "type": { + "type": "struct", + "fields": [ + { + "name": "record_insert_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "record_update_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "pipeline_start_timestamp", + "type": "timestamp", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": "start_utc_timestamp" + } + } + }, + { + "name": "pipeline_update_id", + "type": "string", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": "update_id" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/src/config/operational_metadata_gold.json b/src/config/operational_metadata_gold.json new file mode 100644 index 0000000..fdce258 --- /dev/null +++ b/src/config/operational_metadata_gold.json @@ -0,0 +1,57 @@ +{ + "type": "struct", + "fields": [ + { + "name": "meta_load_details", + "type": { + "type": "struct", + "fields": [ + { + "name": "record_insert_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "record_update_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "pipeline_start_timestamp", + "type": "timestamp", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": 
"start_utc_timestamp" + } + } + }, + { + "name": "pipeline_update_id", + "type": "string", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": "update_id" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/src/config/operational_metadata_silver.json b/src/config/operational_metadata_silver.json new file mode 100644 index 0000000..fdce258 --- /dev/null +++ b/src/config/operational_metadata_silver.json @@ -0,0 +1,57 @@ +{ + "type": "struct", + "fields": [ + { + "name": "meta_load_details", + "type": { + "type": "struct", + "fields": [ + { + "name": "record_insert_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "record_update_timestamp", + "type": "timestamp", + "nullable": false, + "metadata": { + "mapping": { + "type": "sql", + "sql": "current_timestamp()" + } + } + }, + { + "name": "pipeline_start_timestamp", + "type": "timestamp", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": "start_utc_timestamp" + } + } + }, + { + "name": "pipeline_update_id", + "type": "string", + "nullable": true, + "metadata": { + "mapping": { + "type": "pipeline_detail", + "key": "update_id" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/src/constants.py b/src/constants.py new file mode 100644 index 0000000..2122a22 --- /dev/null +++ b/src/constants.py @@ -0,0 +1,164 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass(frozen=True) +class FrameworkSettings: + """ + FrameworkSettings is a class that contains constants for the framework settings. 
@dataclass(frozen=True)
class DLTPipelineSettingKeys:
    """
    DLTPipelineSettingKeys is a class that contains constants for various Pipeline settings keys.

    Each field's default value is the literal key string; the dataclass is
    frozen, so the values cannot be reassigned on an instance.

    Note that two different prefixes are in use and are easy to confuse:
    ``pipelines.`` (catalog, id, target, schema) and ``pipeline.`` (file and
    dataflow filters, layer, ignoreValidationErrors).
    NOTE(review): presumably the ``pipelines.*`` keys are set by the platform
    and the ``pipeline.*`` keys by the framework user - confirm.
    """
    BUNDLE_SOURCE_PATH: str = "bundle.sourcePath"
    BUNDLE_TARGET: str = "bundle.target"
    FRAMEWORK_SOURCE_PATH: str = "framework.sourcePath"
    LOG_LEVEL: str = "logLevel"
    LOGICAL_ENV: str = "logicalEnv"
    PIPELINE_CATALOG: str = "pipelines.catalog"
    PIPELINE_FILE_FILTER: str = "pipeline.fileFilter"
    PIPELINE_FILTER_DATA_FLOW_GROUP: str = "pipeline.dataFlowGroupFilter"
    PIPELINE_FILTER_DATA_FLOW_ID: str = "pipeline.dataFlowIdFilter"
    PIPELINE_FILTER_FLOW_GROUP_ID: str = "pipeline.flowGroupIdFilter"
    PIPELINE_FILTER_TARGET_TABLE: str = "pipeline.targetTableFilter"
    PIPELINE_ID: str = "pipelines.id"
    PIPELINE_IGNORE_VALIDATION_ERRORS: str = "pipeline.ignoreValidationErrors"
    PIPELINE_LAYER: str = "pipeline.layer"
    PIPELINE_TARGET: str = "pipelines.target"
    PIPELINE_SCHEMA: str = "pipelines.schema"
    WORKSPACE_HOST: str = "workspace.host"
@dataclass(frozen=True)
class PipelineBundleSuffixesJson:
    """
    PipelineBundleSuffixesJson is a class that contains constants for various file suffixes used in the Pipeline Bundles in JSON format.

    Every field is a one-element tuple, matching the shape of the
    corresponding fields on PipelineBundleSuffixesYaml, so the two classes
    can be used interchangeably (e.g. with ``str.endswith`` or when
    iterating over the suffixes).
    """
    # The trailing comma is required: ("_main.json") is just a parenthesized
    # str, and iterating it would yield single characters.
    MAIN_SPEC_FILE_SUFFIX: tuple = ("_main.json",)
    FLOW_GROUP_FILE_SUFFIX: tuple = ("_flow.json",)
    EXPECTATIONS_FILE_SUFFIX: tuple = (".json",)
    SECRETS_FILE_SUFFIX: tuple = ("_secrets.json",)
    SUBSTITUTIONS_FILE_SUFFIX: tuple = ("_substitutions.json",)
class SystemColumns:
    """
    SystemColumns is a container for constants related to SDP and Framework system columns.

    The class itself defines no members; it only namespaces the nested enums.
    Both are plain ``Enum`` types (not ``str`` subclasses), so use ``.value``
    to obtain the column name string, e.g.
    ``SystemColumns.SCD2Columns.SCD2_START_AT.value == "__START_AT"``.

    Classes:
        CDFColumns (Enum): Contains constants for Change Data Feed (CDF) system columns.
        SCD2Columns (Enum): Contains constants for Slowly Changing Dimension Type 2 (SCD2) system columns.
    """
    class CDFColumns(Enum):
        """Change Data Feed system columns."""
        CDF_CHANGE_TYPE = "_change_type"
        CDF_COMMIT_VERSION = "_commit_version"
        CDF_COMMIT_TIMESTAMP = "_commit_timestamp"

    class SCD2Columns(Enum):
        """SCD2 system columns."""
        SCD2_START_AT = "__START_AT"
        SCD2_END_AT = "__END_AT"
@dataclass
class CDCSettings:
    """Settings forwarded to the SDP auto CDC API.

    Attributes:
        keys (List): Key columns.
        sequence_by (str): Column used to sequence the change events.
        scd_type (str): SCD type to store the target as.
        where (str, optional): Where clause.
        ignore_null_updates (bool, optional): Ignore null updates flag.
        apply_as_deletes (str, optional): Expression string marking delete events.
        apply_as_truncates (str, optional): Expression string marking truncate events.
        column_list (List, optional): Columns to include.
        except_column_list (List, optional): Columns to exclude.
        track_history_column_list (List, optional): Columns to track history for.
        track_history_except_column_list (List, optional): Columns to exclude
            from history tracking.
        sequence_by_data_type (T.DataType, optional): Set to ``TimestampType``
            in ``__post_init__`` when ``scd_type`` is ``"2"``.
    """
    keys: List
    sequence_by: str
    scd_type: str
    where: str = None
    ignore_null_updates: bool = False
    apply_as_deletes: str = None
    apply_as_truncates: str = None
    column_list: List = field(default_factory=list)
    except_column_list: List = field(default_factory=list)
    track_history_column_list: List = field(default_factory=list)
    track_history_except_column_list: List = field(default_factory=list)
    sequence_by_data_type: T.DataType = None

    def __post_init__(self):
        """Default the sequence-by data type for SCD type 2 settings."""
        if self.scd_type != "2":
            return
        # TODO: implement dynamic sequence by type
        self.sequence_by_data_type = T.TimestampType()


class CDCFlow:
    """Builds an auto CDC flow from a ``CDCSettings`` instance."""

    def __init__(self, settings: CDCSettings):
        """Keep the settings object and mirror its fields as instance attributes."""
        self.settings = settings
        mirrored_fields = (
            "apply_as_deletes",
            "apply_as_truncates",
            "column_list",
            "except_column_list",
            "keys",
            "sequence_by",
            "scd_type",
            "track_history_column_list",
            "track_history_except_column_list",
            "where",
            "ignore_null_updates",
        )
        for field_name in mirrored_fields:
            setattr(self, field_name, getattr(settings, field_name))

    def create(
        self,
        target_table: str,
        source_view_name: str,
        flow_name: Optional[str] = None,
        additional_except_columns: Optional[List[str]] = None,
        run_once: bool = False
    ) -> None:
        """Register the CDC flow through ``dp.create_auto_cdc_flow``.

        Args:
            target_table: Name of the target table.
            source_view_name: Name of the source view.
            flow_name: Optional name for the flow.
            additional_except_columns: Extra columns appended to the configured
                except-column list.
            run_once: Passed to the API as ``once``.
        """
        log = pipeline_config.get_logger()
        log.debug("CDC API: passed cdc_settings: %s", self.settings)

        extra_excluded = additional_except_columns or []
        log.debug("CDC API: passed additional_except_columns: %s", extra_excluded)

        # The delete / truncate conditions are configured as strings and are
        # converted to column expressions only when present.
        deletes_expr = None
        if self.apply_as_deletes:
            deletes_expr = F.expr(self.apply_as_deletes)
        log.debug("CDC API: apply_as_deletes: %s", deletes_expr)

        truncates_expr = None
        if self.apply_as_truncates:
            truncates_expr = F.expr(self.apply_as_truncates)
        log.debug("CDC API: apply_as_truncates: %s", truncates_expr)

        # Merge the configured and the additional except columns into a new
        # list; the CDC API throws an error on an empty list, so pass None.
        excluded = [*(self.except_column_list or []), *extra_excluded] or None
        log.debug("CDC API: except_column_list: %s", excluded)

        # A missing or blank where clause is passed as None.
        where_clause = (self.where or "").strip() or None

        dp.create_auto_cdc_flow(
            flow_name=flow_name,
            target=target_table,
            once=run_once,
            source=source_view_name,
            keys=self.keys,
            sequence_by=self.sequence_by,
            where=where_clause,
            ignore_null_updates=self.ignore_null_updates,
            apply_as_deletes=deletes_expr,
            apply_as_truncates=truncates_expr,
            column_list=self.column_list,
            except_column_list=excluded,
            stored_as_scd_type=self.scd_type,
            track_history_column_list=self.track_history_column_list,
            track_history_except_column_list=self.track_history_except_column_list
        )
@dataclass(frozen=True)
class CDCSnapshotTypes:
    """Constants for the types of CDC Snapshot."""
    HISTORICAL = "historical"
    PERIODIC = "periodic"


@dataclass(frozen=True)
class CDCSnapshotSourceTypes:
    """Constants for the types of CDC Snapshot source."""
    FILE = "file"
    TABLE = "table"


@dataclass(frozen=True)
class CDCSnapshotVersionTypes:
    """Constants for the types of CDC Snapshot version."""
    DATE = "date"
    INTEGER = "integer"
    LONG = "long"
    TIMESTAMP = "timestamp"


@dataclass
class VersionInfo:
    """A snapshot version together with the information needed to format it.

    Attributes:
        raw_value: The version value as parsed or read from the source.
        version_type: One of the ``CDCSnapshotVersionTypes`` constants.
        datetime_format: ``strftime`` format used for timestamp versions.
        micro_second_mask_length: Number of fractional-second digits to keep
            when ``datetime_format`` contains ``%f``.
    """
    raw_value: Union[str, int, datetime]
    version_type: str
    datetime_format: Optional[str] = None
    micro_second_mask_length: Optional[int] = None

    @property
    def formatted_value(self) -> str:
        """Return the version as a string.

        Timestamp versions holding a ``datetime`` are rendered with
        ``datetime_format`` (default ``'%Y-%m-%d %H:%M:%S'``); everything else
        is rendered with ``str()``.
        """
        is_datetime_version = (
            self.version_type == CDCSnapshotVersionTypes.TIMESTAMP
            and isinstance(self.raw_value, datetime)
        )
        if not is_datetime_version:
            return str(self.raw_value)

        if not self.datetime_format:
            return self.raw_value.strftime('%Y-%m-%d %H:%M:%S')

        formatted = self.raw_value.strftime(self.datetime_format)
        if '%f' in self.datetime_format and self.micro_second_mask_length:
            # %f always renders 6 digits; drop the surplus digits from the
            # right (this relies on %f being the last directive).
            truncate_from_right = 6 - self.micro_second_mask_length
            # Guard against a mask of 6 or more: slicing with [:-0] would
            # return an empty string and a negative count would slice from
            # the wrong end.
            if truncate_from_right > 0:
                formatted = formatted[:-truncate_from_right]
        return formatted

    @property
    def sql_formatted_value(self) -> str:
        """Return the formatted value wrapped in single quotes for use in SQL.

        Raises:
            ValueError: If ``version_type`` is not a supported version type.
        """
        supported_types = (
            CDCSnapshotVersionTypes.TIMESTAMP,
            CDCSnapshotVersionTypes.DATE,
            CDCSnapshotVersionTypes.INTEGER,
            CDCSnapshotVersionTypes.LONG,
        )
        if self.version_type not in supported_types:
            raise ValueError(f"Unsupported version type: {self.version_type}")
        return f"'{self.formatted_value}'"


@dataclass
class FilePathInfo:
    """A file's full path plus its path suffix starting at the version segment."""
    full_path: str
    filename_with_version_path: str


@dataclass
class CDCSnapshotFileSource:
    """Source configuration for a file-based CDC Snapshot."""
    format: str
    path: str
    readerOptions: Dict = field(default_factory=dict)
    filter: Optional[str] = None
    versionType: Optional[str] = None
    startingVersion: Optional[Union[int, str]] = None
    datetimeFormat: Optional[str] = None
    microSecondMaskLength: Optional[int] = None
    schemaPath: Optional[str] = None
    selectExp: Optional[List[str]] = None
    recursiveFileLookup: bool = False


@dataclass
class CDCSnapshotTableSource:
    """Source configuration for a table-based CDC Snapshot."""
    table: str
    versionColumn: str
    versionType: str
    startingVersion: Optional[Union[int, str]] = None
    selectExp: Optional[List[str]] = None
@dataclass
class CDCSnapshotSettings:
    """CDC Settings for the SDP auto CDC Snapshot API.

    Attributes:
        keys: Key columns.
        scd_type: SCD type to store the target as.
        snapshotType: One of the ``CDCSnapshotTypes`` constants.
        sourceType: One of the ``CDCSnapshotSourceTypes`` constants.
        source: Raw source configuration; its keys must match the fields of
            the source dataclass selected by ``sourceType``.
        track_history_column_list: Columns to track history for.
        track_history_except_column_list: Columns to exclude from history tracking.
        sequence_by_data_type: Derived in ``__post_init__``.
    """
    keys: List[str]
    scd_type: str
    snapshotType: str
    sourceType: str = None
    source: Dict = field(default_factory=dict)
    track_history_column_list: Optional[List[str]] = None
    track_history_except_column_list: Optional[List[str]] = None
    sequence_by_data_type: T.DataType = None

    def __post_init__(self):
        """Validate the settings and derive the sequence-by data type.

        Raises:
            ValueError: If the snapshot type is historical and no source is
                configured, or if the source type is unsupported.
        """
        if self.snapshotType == CDCSnapshotTypes.HISTORICAL and not self.source:
            raise ValueError("Source is required for Historical CDC from Snapshot")

        if self.scd_type == "2":
            # TODO: implement dynamic sequence by type
            self.sequence_by_data_type = T.TimestampType()

        if self.snapshotType == CDCSnapshotTypes.HISTORICAL:
            # Numeric snapshot versions override the timestamp default so the
            # sequence type matches the configured version type.
            version_type = self.get_source().versionType
            if version_type == CDCSnapshotVersionTypes.INTEGER:
                self.sequence_by_data_type = T.IntegerType()
            elif version_type == CDCSnapshotVersionTypes.LONG:
                self.sequence_by_data_type = T.LongType()

    def get_source(self) -> Union[CDCSnapshotFileSource, CDCSnapshotTableSource]:
        """Build the typed source configuration from the raw ``source`` dict.

        Raises:
            ValueError: If ``sourceType`` is neither file nor table.
        """
        if self.sourceType == CDCSnapshotSourceTypes.FILE:
            return CDCSnapshotFileSource(**self.source)
        elif self.sourceType == CDCSnapshotSourceTypes.TABLE:
            return CDCSnapshotTableSource(**self.source)
        else:
            raise ValueError(f"Unsupported source type: {self.sourceType}")

    def is_historical(self) -> bool:
        """Return True when the CDC snapshot type is historical."""
        return self.snapshotType == CDCSnapshotTypes.HISTORICAL

    def is_file_source(self) -> bool:
        """Return True when the CDC snapshot source type is file."""
        return self.sourceType == CDCSnapshotSourceTypes.FILE
class CDCSnapshotFlow:
    """Creates a CDC-from-snapshot flow (periodic or historical)."""

    def __init__(self, settings: CDCSnapshotSettings):
        """Store the settings and, for historical snapshots, resolve the source.

        Raises:
            ValueError: If the snapshot type is historical and no source
                configuration could be resolved.
        """
        self.settings = settings
        self.logger = pipeline_config.get_logger()

        # Core CDC settings
        self.keys = settings.keys
        self.scd_type = settings.scd_type
        self.snapshotType = settings.snapshotType
        self.track_history_column_list = settings.track_history_column_list
        self.track_history_except_column_list = settings.track_history_except_column_list
        self.sequence_by_data_type = settings.sequence_by_data_type

        # Historical snapshot specific settings
        self.sourceType = None
        self.source = None
        if self.snapshotType == CDCSnapshotTypes.HISTORICAL:
            self.sourceType = settings.sourceType
            self.source = settings.get_source()

            if self.source is None:
                raise ValueError("Source configuration is required for historical snapshots")

        # Cached version data, filled lazily on first use.
        self._available_versions: Optional[List[VersionInfo]] = None
        self._sorted_versions: Optional[List[VersionInfo]] = None
        self._version_values: Optional[List[Union[int, datetime]]] = None

    @property
    def sorted_versions(self) -> List[VersionInfo]:
        """Available versions sorted by raw value (empty list when none are cached)."""
        if self._sorted_versions is None and self._available_versions:
            self._sorted_versions = sorted(self._available_versions, key=lambda x: x.raw_value)
        return self._sorted_versions or []

    @property
    def version_values(self) -> List[Union[int, datetime]]:
        """Raw values of ``sorted_versions``, cached after the first access."""
        if self._version_values is None:
            self._version_values = [v.raw_value for v in self.sorted_versions]
        return self._version_values

    def create(
        self,
        dataflow_config: DataFlowConfig,
        target_table: str,
        source_view_name: Optional[str] = None,
        target_config_flags: Optional[List[str]] = None,
        flow_name: Optional[str] = None  # TODO: Add flow name
    ) -> None:
        """Create CDC from snapshot flow.

        Args:
            dataflow_config: DataFlow configuration (used for historical snapshots).
            target_table: Name of the target table.
            source_view_name: Name of the source view (used for periodic snapshots).
            target_config_flags: Config flags forwarded to the snapshot reader
                (used for historical snapshots).
            flow_name: Optional name for the flow; currently not used.

        Raises:
            ValueError: If the snapshot type is not supported.
        """
        self.logger.debug(f"CDC From Snapshot: {self.source}")

        try:
            if self.snapshotType == CDCSnapshotTypes.PERIODIC:
                self._apply_periodic_changes(target_table, source_view_name)
            elif self.snapshotType == CDCSnapshotTypes.HISTORICAL:
                self._apply_historical_changes(target_table, dataflow_config, target_config_flags)
            else:
                raise ValueError(f"Unsupported snapshot type: {self.snapshotType}")
        except Exception as e:
            # Log for context, then let the caller see the original error.
            self.logger.error(f"Failed to create CDC snapshot flow: {e}")
            raise

    def _apply_periodic_changes(self, target_table: str, source_view_name: str) -> None:
        """Apply periodic changes from snapshot, reading from the source view."""
        dp.create_auto_cdc_from_snapshot_flow(
            target=target_table,
            source=source_view_name,
            keys=self.keys,
            stored_as_scd_type=self.scd_type,
            track_history_column_list=self.track_history_column_list,
            track_history_except_column_list=self.track_history_except_column_list
        )

    def _apply_historical_changes(
        self,
        target_table: str,
        dataflow_config: DataFlowConfig,
        target_config_flags: Optional[List[str]] = None
    ):
        """Apply historical changes from snapshot.

        The API is handed a callback that is given a snapshot version and
        delegates to ``_next_snapshot_and_version``.
        """
        dp.create_auto_cdc_from_snapshot_flow(
            target=target_table,
            snapshot_and_version=lambda version: self._next_snapshot_and_version(
                version, dataflow_config, target_config_flags),
            keys=self.keys,
            stored_as_scd_type=self.scd_type,
            track_history_column_list=self.track_history_column_list,
            track_history_except_column_list=self.track_history_except_column_list
        )

    def _next_snapshot_and_version(
        self,
        latest_snapshot_version,
        dataflow_config: DataFlowConfig,
        target_config_flags: Optional[List[str]] = None
    ):
        """Get the next snapshot and version.

        Returns:
            A ``(dataframe, raw_version)`` tuple, or ``None`` when there is no
            further version, the snapshot is empty, or a non-``ValueError``
            exception occurred while processing.

        Raises:
            ValueError: Re-raised from the version lookup / snapshot read.
        """
        try:
            # The available versions are listed once and then reused.
            if self._available_versions is None:
                self._available_versions = self._get_available_versions(latest_snapshot_version)

            if not self._available_versions:
                self.logger.warning("CDC Snapshot: No valid versions found")
                return None

            version_info = self._get_next_version(latest_snapshot_version)
            if version_info is None:
                self.logger.debug("CDC Snapshot: Retrieving next version was None")
                return None

            self.logger.info(f"CDC Snapshot: Reading file version: {version_info.formatted_value}")
            df = self._read_snapshot_dataframe(version_info, dataflow_config, target_config_flags)
            if df is None or df.isEmpty():
                self.logger.debug("CDC Snapshot: Retrieving snapshot dataframe was None or empty")
                return None

            self.logger.info(f"CDC Snapshot: Returning dataframe with version: {version_info.formatted_value}. Raw version: {version_info.raw_value}.")
            return (df, version_info.raw_value)

        except Exception as e:
            self.logger.error(f"CDC Snapshot: Error processing snapshots: {e}")
            # Only ValueError is propagated; any other error is logged above
            # and reported as "no snapshot".
            if isinstance(e, ValueError):
                raise e
            return None

    def _get_available_versions(self, latest_snapshot_version: Optional[Union[int, datetime]]) -> List[VersionInfo]:
        """Get list of available versions from the configured source type.

        Raises:
            ValueError: If the source type is neither file nor table.
        """
        if self.sourceType == CDCSnapshotSourceTypes.FILE:
            return self._get_available_file_versions(latest_snapshot_version)
        elif self.sourceType == CDCSnapshotSourceTypes.TABLE:
            return self._get_available_table_versions(latest_snapshot_version)
        else:
            raise ValueError(f"Unsupported source type: {self.sourceType}")

    def _list_files(self, path, recursive=True):
        """List files in a directory, with optional recursive file lookup.

        Args:
            path: Directory path to list files from
            recursive: If True, list files recursively. If False, list only files in the immediate directory.

        Returns:
            List of the non-directory entries returned by ``dbutils.fs().ls()``.
        """
        dbutils = pipeline_config.get_dbutils()
        all_files = []

        for f in dbutils.fs().ls(path):
            if not f.isDir():
                all_files.append(f)
            elif recursive:
                # Descend into sub-directories only for recursive lookups.
                all_files.extend(self._list_files(f.path, recursive=True))

        return all_files

    def _get_available_file_versions(self, latest_snapshot_version: Optional[Union[int, datetime]]) -> List[VersionInfo]:
        """Get list of available versions from file path.

        The configured path is split at the first segment containing
        ``{version}``: everything before it is the directory to list,
        everything from it onwards is the pattern files are matched against.

        Raises:
            ValueError: If the path has no ``{version}`` placeholder, or if
                recursive lookup is enabled while the placeholder is in the
                last path segment.
        """
        path_parts = self.source.path.split('/')
        version_part_idx = next(
            (idx for idx, part in enumerate(path_parts) if '{version}' in part), None)
        if version_part_idx is None:
            raise ValueError("No {version} found in path")
        parent_dir = '/'.join(path_parts[:version_part_idx])
        file_pattern = '/'.join(path_parts[version_part_idx:])

        self.logger.debug(f"CDC Snapshot: Listing files in {parent_dir} with pattern {file_pattern}")

        # List files using the configured recursive file lookup option
        recursive_file_lookup = self.source.recursiveFileLookup
        self.logger.debug(f"CDC Snapshot: Using recursive file lookup: {recursive_file_lookup}")
        if recursive_file_lookup:
            last_segment = file_pattern.split('/')[-1]
            if '{version}' in last_segment:
                raise ValueError(
                    f"CDC Snapshot: Recursive file lookup was enabled but the path '{file_pattern}' does not cater for recursive lookup. "
                    "Please update the path format to cater for recursive lookup. See documentation for details."
                )
        files_list = self._list_files(parent_dir, recursive=recursive_file_lookup)
        files_with_path_info = [
            FilePathInfo(
                full_path=f.path,
                filename_with_version_path='/'.join(f.path.split('/')[version_part_idx:])
            )
            for f in files_list
        ]

        self.logger.debug(f"CDC Snapshot: Found {len(files_with_path_info)}")

        # Extract version from filename and filter by latest_snapshot_version if provided
        available_versions = []
        for file in files_with_path_info:
            self.logger.debug(f"CDC Snapshot: Processing file: {file.filename_with_version_path}")
            try:
                version_info = self._extract_version_from_filename(file.filename_with_version_path, file_pattern)
                if version_info is None:
                    continue

                self.logger.debug(f"CDC Snapshot: Extracted version from filename: {version_info.formatted_value}. Raw version: {version_info.raw_value}")

                # The starting version only applies before any snapshot has been processed.
                # NOTE(review): assumes startingVersion is comparable with the parsed raw value - confirm for timestamp versions.
                if latest_snapshot_version is None and self.source.startingVersion is not None and version_info.raw_value < self.source.startingVersion:
                    continue

                if latest_snapshot_version is not None and version_info.raw_value <= latest_snapshot_version:
                    continue

                available_versions.append(version_info)
                self.logger.debug(f"CDC Snapshot: Added version {version_info.formatted_value} to available versions")

            except ValueError as e:
                # Files whose version cannot be parsed are skipped, not fatal.
                self.logger.warning(f"CDC Snapshot: Skipping file '{file.filename_with_version_path}' - {e}")
                continue

        return available_versions

    def _get_available_table_versions(self, latest_snapshot_version: Optional[Union[int, datetime]]) -> List[VersionInfo]:
        """Get list of available versions from table.

        Raises:
            ValueError: If the version column type is unsupported or differs
                from the configured version type.
        """
        spark = pipeline_config.get_spark()
        table_name = self.source.table

        self.logger.info(f"CDC Snapshot: Getting versions from table: {table_name}")
        try:
            df = spark.table(table_name)
        except Exception as e:
            self.logger.error(f"CDC Snapshot: Error getting versions from table: {e}")
            raise

        # Get the version column
        version_column = self.source.versionColumn

        # Check if the version column is a valid data type
        valid_data_types = ["timestamp", "date", "integer","long"]
        version_column_type = df.schema[version_column].dataType.typeName()
        if version_column_type not in valid_data_types:
            raise ValueError(f"Version column: {version_column}, type: {version_column_type}, is not a valid data type: {valid_data_types}")
        if version_column_type != self.source.versionType:
            raise ValueError(f"Version column: {version_column}, type: {version_column_type}, does not match specified version type: {self.source.versionType}")

        # Get the version values and filter by latest_snapshot_version if provided
        if latest_snapshot_version is not None:
            latest_version_info = VersionInfo(
                raw_value=latest_snapshot_version,
                version_type=self.source.versionType)
            version_df = df.select(version_column).where(f"{version_column} > {latest_version_info.sql_formatted_value}").distinct()
        else:
            version_df = df.select(version_column).distinct()

        available_versions = []
        for row in version_df.collect():
            version = row[version_column]

            if version is None:
                continue

            if self.source.startingVersion is not None and version < self.source.startingVersion:
                self.logger.debug(f"CDC Snapshot: Skipping version {version} because it is less than the starting version {self.source.startingVersion}")
                continue

            if latest_snapshot_version is not None and version <= latest_snapshot_version:
                self.logger.debug(f"CDC Snapshot: Skipping version {version} because it is less than or equal to the latest snapshot version {latest_snapshot_version}")
                continue

            version_info = VersionInfo(
                raw_value=version,
                version_type=self.source.versionType,
                datetime_format=None
            )

            available_versions.append(version_info)

        self.logger.debug(f"CDC Snapshot: Found {len(available_versions)} available versions")

        return available_versions

    def _extract_version_from_filename(self, filename: str, file_pattern: str) -> Optional[VersionInfo]:
        """Extract version from filename using pattern.

        Returns:
            The parsed ``VersionInfo``, or ``None`` when the filename does not
            match the pattern.

        Raises:
            ValueError, TypeError: If the matched version string cannot be parsed.
        """
        # Escape the pattern, then turn the escaped placeholder into a capture group.
        regex_pattern = re.escape(file_pattern).replace(r'\{version\}', r'(.+)')
        match = re.match(regex_pattern, filename)
        if not match or not match.group(1):
            self.logger.debug(f"CDC Snapshot: No version string match found for filename: {filename}")
            self.logger.debug(f"CDC Snapshot: Regex pattern: {regex_pattern}")
            return None

        version_str = match.group(1)
        self.logger.debug(f"CDC Snapshot: Version string match found: {version_str}")

        is_timestamp = self.source.versionType == CDCSnapshotVersionTypes.TIMESTAMP
        try:
            # Every non-timestamp version type is parsed as an integer.
            if is_timestamp:
                raw_value = datetime.strptime(version_str, self.source.datetimeFormat)
            else:
                raw_value = int(version_str)

            return VersionInfo(
                raw_value=raw_value,
                version_type=self.source.versionType,
                datetime_format=self.source.datetimeFormat if is_timestamp else None,
                micro_second_mask_length=self.source.microSecondMaskLength
                if is_timestamp and self.source.microSecondMaskLength else None
            )
        except (ValueError, TypeError) as e:
            self.logger.error(f"CDC Snapshot: Failed to parse version '{version_str}': {e}")
            raise

    def _get_next_version(self, latest_snapshot_version: Optional[Union[int, datetime]]) -> Optional[VersionInfo]:
        """Get the next version to process, or ``None`` when there is none."""
        # If no previous version exists yet
        if latest_snapshot_version is None:
            if not self.sorted_versions:
                return None
            version_info = self.sorted_versions[0]
            self.logger.debug(f"CDC Snapshot: Using initial version: {version_info.formatted_value}")
            return version_info

        # If a previous version exists,
        # use bisect to find the first version greater than latest_snapshot_version
        index = bisect.bisect_right(self.version_values, latest_snapshot_version)
        if index < len(self.sorted_versions):
            version_info = self.sorted_versions[index]
            self.logger.debug(f"CDC Snapshot: Using next version: {version_info.formatted_value}")
            return version_info
        else:
            self.logger.debug("CDC Snapshot: No more versions available")
            return None

    def _read_snapshot_dataframe(
        self,
        version_info: VersionInfo,
        dataflow_config: DataFlowConfig,
        target_config_flags: Optional[List[str]] = None
    ) -> Optional[DataFrame]:
        """Read the snapshot for one version into a dataframe.

        Raises:
            ValueError: If the table name has fewer than two parts or the
                source type is unsupported.
        """
        read_config = ReadConfig(
            features=dataflow_config.features,
            mode="batch",
            target_config_flags=target_config_flags
        )

        if self.sourceType == CDCSnapshotSourceTypes.FILE:
            file_path = self.source.path.replace("{version}", version_info.formatted_value)
            self.logger.debug(f"CDC Snapshot: Reading file: {file_path}")

            schema_path = self.source.schemaPath
            select_exp = self.source.selectExp

            df = SourceBatchFiles(
                path=file_path,
                format=self.source.format,
                readerOptions=self.source.readerOptions,
                schemaPath=schema_path,
                selectExp=select_exp
            ).read_source(read_config)

            # Apply filter if specified
            if self.source.filter:
                df = df.where(self.source.filter.replace("{version}", version_info.formatted_value))

        elif self.sourceType == CDCSnapshotSourceTypes.TABLE:
            table_parts = self.source.table.split(".")
            if len(table_parts) < 2:
                raise ValueError(f"Invalid table name format: {self.source.table}. Expected format: database.schema.table")

            # Everything before the last part is the database qualifier, so a
            # two-part name (schema.table) resolves correctly as well.
            table = table_parts[-1]
            database = ".".join(table_parts[:-1])
            select_exp = self.source.selectExp
            where_clause = [
                f"{self.source.versionColumn} = {version_info.sql_formatted_value}"]

            self.logger.info(f"CDC Snapshot: Reading table: {database}.{table} with where clause: {where_clause}")
            df = SourceDelta(
                database=database,
                table=table,
                whereClause=where_clause,
                selectExp=select_exp
            ).read_source(read_config)

        else:
            # Fail explicitly rather than with an unbound local variable.
            raise ValueError(f"Unsupported source type: {self.sourceType}")

        return df
+ pipeline_catalog (str): The pipeline catalog. + pipeline_schema (str): The pipeline schema. + uc_enabled (bool): Whether Unity Catalog is enabled. + + # target details + target_details (Dict): The target details. + target_database (str): The target database. + + # CDC settings + cdc_settings (CDCSettings): The CDC settings. + cdc_snapshot_settings (CDCSnapshotSettings): The CDC snapshot settings. + + # expectations + expectations_enabled (bool): Whether expectations are enabled. + expectations (DataQualityExpectations): The data quality expectations. + expectations_clause (Dict): The expectations clause for the SDP create table api. + + # quarantine settings + quarantine_enabled (bool): Whether quarantine is enabled. + quarantine_manager (QuarantineManager): The quarantine manager. + quarantine_mode (str): The quarantine mode. + + # table migration settings + table_migration_manager (TableMigrationManager): The table migration manager. + table_migration_enabled (bool): Whether table migration is enabled. + + # features + features (Dict): The features. + + Methods: + create_dataflow(): + Creates the data flow based on the specifications. 
+ """ + + CDF_COLUMN_NAMES = [column.value for column in SystemColumns.CDFColumns] + SCD2_COLUMN_NAMES = [column.value for column in SystemColumns.SCD2Columns] + + def __init__( + self, + dataflow_spec: DataflowSpec + ): + self.dataflow_spec = dataflow_spec + self.spark = pipeline_config.get_spark() + self.logger = pipeline_config.get_logger() + self.pipeline_details = pipeline_config.get_pipeline_details() + + self.pipeline_catalog = self.pipeline_details.pipeline_catalog + self.pipeline_schema = self.pipeline_details.pipeline_schema + + self.flow_groups = self.dataflow_spec.get_flow_groups() + self.local_path = self.dataflow_spec.localPath + self.features = self.dataflow_spec.get_features() + self.uc_enabled = self.spark.conf.get("spark.databricks.unityCatalog.enabled", "false").lower() == "true" + self.table_migration_enabled = False + + self.dataflow_config = DataFlowConfig( + features=self.features, + uc_enabled=self.uc_enabled + ) + + self._init_target_details() + self._init_cdc_settings() + self._init_expectations() + self._init_quarantine() + self._init_table_migration() + + def _init_target_details(self): + """init target details from the dataflow specification.""" + self.target_details = self.dataflow_spec.get_target_details() + self.target_database = ( + self.target_details.database + if hasattr(self.target_details, 'database') and self.target_details.database + else f"{self.pipeline_catalog}.{self.pipeline_schema}" + ) + + log_target = f"target type: {self.dataflow_spec.targetFormat}, target: " + ( + getattr(self.target_details, 'sink_name') + if self.dataflow_spec.targetFormat in SinkType.__dict__.values() + else getattr(self.target_details, 'table') + ) + self.logger.info(f"Initializing DataFlow for target schema: {self.target_database}, {log_target}") + self.logger.debug(f"Target Details: {self.target_details.__dict__}") + + # Add operational metadata columns to the schema + if not hasattr(self.target_details, 'schema') or not 
self.target_details.schema: + return + + if not self.features.operationalMetadataEnabled: + return + + def _init_cdc_settings(self): + """init CDC settings.""" + + def get_scd2_columns(sequence_by_data_type: T.DataType) -> List[T.StructField]: + return [ + T.StructField(SystemColumns.SCD2Columns.SCD2_START_AT.value, sequence_by_data_type), + T.StructField(SystemColumns.SCD2Columns.SCD2_END_AT.value, sequence_by_data_type) + ] + + self.cdc_settings = self.dataflow_spec.get_cdc_settings() + self.cdc_snapshot_settings = self.dataflow_spec.get_cdc_snapshot_settings() + + # return if target has no schema + if not hasattr(self.target_details, 'schema') or not self.target_details.schema: + return + + # init CDC + if self.cdc_settings and self.cdc_settings.scd_type == "2": + + # TODO: implement dynamic sequence by type + sequence_by_data_type = self.cdc_settings.sequence_by_data_type + scd2_columns = get_scd2_columns(sequence_by_data_type) + self.target_details.add_columns(scd2_columns) + + # init CDC Snapshot + elif self.cdc_snapshot_settings and self.cdc_snapshot_settings.scd_type == "2": + + # TODO: implement dynamic sequence by type + sequence_by_data_type = self.cdc_snapshot_settings.sequence_by_data_type + scd2_columns = get_scd2_columns(sequence_by_data_type) + self.target_details.add_columns(scd2_columns) + + def _init_expectations(self): + """init expectations.""" + self.expectations_clause = None + self.expectations_enabled = self.dataflow_spec.dataQualityExpectationsEnabled + + if self.expectations_enabled: + self.logger.info("Expectations enabled") + self.expectations = self.dataflow_spec.get_data_quality_expectations() + + if self.expectations is None: + raise RuntimeError("Expectations object is None and not initialized correctly in Dataflow!") + + self.logger.debug(f"Expectations Object: {self.expectations.__dict__}") + self.expectations_clause = self.expectations.get_expectations() + + def _init_quarantine(self): + """init quarantine settings.""" + 
def create_dataflow(self):
    """Create the target object and all flow groups for this dataflow.

    Delta targets are dispatched on the target's table type:

    * streaming table: create the table, then the table migration flow (if a
      migration manager was initialised), then the flow groups;
    * materialized view: create the flow groups, then the table.

    Sink targets create the sink and then the flow groups.

    When quarantine is enabled in FLAG mode, every expectation is passed to
    the target as ``expect_all``.

    Raises:
        ValueError: If the target format is neither Delta nor a sink format.
    """
    spec = self.dataflow_spec
    target_format = spec.targetFormat

    # SinkType is a constants holder: its class __dict__ values contain the
    # supported sink format strings. Evaluate the membership once.
    is_sink = target_format in SinkType.__dict__.values()
    target_name = self.target_details.sink_name if is_sink else self.target_details.table

    # Lazy %-formatting keeps the message text identical while avoiding a
    # TypeError from string concatenation when the target name is None.
    self.logger.info(
        "Flow ID: %s\nFlow Group: %s\nTarget: target type: %s, target: %s",
        spec.dataFlowId,
        spec.dataFlowGroup,
        target_format,
        target_name,
    )
    self.logger.debug("Pipeline Details: %s", self.pipeline_details)
    self.logger.debug("Dataflow Specification: %s", spec.__dict__)

    expectations = self.expectations_clause

    # ensure expectations are converted to expect_all if quarantine FLAG mode is enabled
    if self.quarantine_enabled and self.quarantine_mode == QuarantineMode.FLAG:
        expectations = self.expectations.get_expectations_as_expect_all()

    if target_format == TargetType.DELTA:
        table_type = self.target_details.type

        if table_type == TableType.STREAMING.value:
            # create streaming table
            self.target_details.create_table(expectations)

            # setup table migration
            if self.table_migration_manager:
                self.table_migration_manager.create_flow()

            # create flow groups
            self._create_flow_groups()

        elif table_type == TableType.MATERIALIZED_VIEW.value:
            # create flow groups
            self._create_flow_groups()

            # create materialized view
            self.target_details.create_table(expectations)

        # NOTE(review): any other table type is silently ignored, as before.

    elif is_sink:
        # create sink
        self.target_details.create_sink()

        # create flow groups
        self._create_flow_groups()

    else:
        raise ValueError(f"Unsupported target format: {target_format}")
def _create_flow(self, flow: BaseFlow, staging_tables: Dict[str, StagingTable]):
    """Create a single flow, together with its views when the flow type has them.

    Args:
        flow: The flow to create.
        staging_tables: Staging tables of the enclosing flow group, used to
            resolve the flow configuration.
    """
    self.logger.info("Creating Views...")

    # Resolve whether the flow writes to the dataflow target and build its config.
    writes_to_target = self.is_target(flow.targetTable)
    config = self._prepare_flow_config(flow, staging_tables)

    if not isinstance(flow, BaseFlowWithViews):
        # Flows without views receive the quarantine rules directly. In TABLE
        # mode no rules are passed: they are applied by the quarantine flow.
        rules = None
        if (
            self.quarantine_enabled
            and self.quarantine_mode != QuarantineMode.TABLE
            and writes_to_target
        ):
            rules = self.quarantine_manager.quarantine_rules

        flow.create_flow(self.dataflow_config, config, rules)
        return

    # View based flows: views first, then the flow itself.
    flow_views = flow.get_views() or {}
    self._create_views(flow_views, flow.sourceView, writes_to_target, config.target_config_flags)
    flow.create_flow(self.dataflow_config, config)

    # TABLE quarantine mode adds a dedicated quarantine flow from the source view.
    if (
        self.quarantine_enabled
        and self.quarantine_mode == QuarantineMode.TABLE
        and writes_to_target
    ):
        self.quarantine_manager.create_quarantine_flow(flow.sourceView)
) + + def _get_cdc_settings(self, flow: BaseFlow, staging_tables: Dict[str, StagingTable]) -> Dict: + """Get CDC settings for the flow.""" + self.logger.debug("Retrieving CDC settings for table: %s", flow.targetTable) + is_target = self.is_target(flow.targetTable) + if is_target: + cdc_settings = self.cdc_settings + cdc_snapshot_settings = self.cdc_snapshot_settings + else: + staging_table = staging_tables.get(flow.targetTable) + if staging_table is None: + raise ValueError(f"Staging table not found for CDC settings retrieval: {flow.targetTable}") + cdc_settings = staging_table.get_cdc_settings() + cdc_snapshot_settings = staging_table.get_cdc_snapshot_settings() + + self.logger.debug("Retrieved CDC settings: %s \nCDC snapshot settings: %s", cdc_settings, cdc_snapshot_settings) + + return { + "cdc_settings": cdc_settings, + "cdc_snapshot_settings": cdc_snapshot_settings + } + + def _get_exclude_columns(self, flow: BaseFlow) -> List[str]: + """Get list of columns to exclude based on quarantine and CDF settings.""" + exclude_columns = [] + + # Add quarantine flag if enabled for this target + is_target = self.is_target(flow.targetTable) + if ( + self.quarantine_enabled + and self.quarantine_mode == QuarantineMode.TABLE + and is_target + ): + exclude_columns.append(QuarantineManager.QUARANTINE_COLUMN.get("name")) + + return exclude_columns + + def _get_column_prefix_exceptions(self) -> List[str]: + """Get list of columns to exclude from prefix treatment.""" + exceptions = [ + SystemColumns.SCD2Columns.SCD2_START_AT.value, + SystemColumns.SCD2Columns.SCD2_END_AT.value + ] + + return exceptions + + def is_target(self, name: str) -> bool: + """Check if the table is the target.""" + if self.dataflow_spec.targetFormat in SinkType.__dict__.values(): + return name == self.target_details.sink_name + else: + return name == self.target_details.table diff --git a/src/dataflow/dataflow_config.py b/src/dataflow/dataflow_config.py new file mode 100644 index 0000000..68f7c86 --- 
@dataclass
class DataFlowConfig:
    """Settings passed to flows and views when a data flow is created.

    Attributes:
        features (Features): Feature switches for the data flow.
        uc_enabled (bool): Whether UC mode is used. Defaults to True.
    """
    features: Features
    uc_enabled: bool = True
+ quarantineMode (str, optional): Quarantine mode. + quarantineTargetDetails (Dict, optional): Quarantine target details. + tableMigrationDetails (Dict, optional): Table migration details. + localPath (str, optional): Local path. + + Methods: + get_all_views(): Get all views from the flow dataflow specification. + get_all_cdf_delta_views(): Get all views with CDF enabled for Delta targets. + get_cdc_settings(): Get CDC settings + get_cdc_snapshot_settings(): Get CDC snapshot settings. + get_data_quality_expectations(): Get data quality expectations. + get_flow_groups(): Get flow groups. + get_target_details(): Get target details for Delta targets. + get_all_source_views(): Get all views that directly read from source tables. + """ + dataFlowId: str + dataFlowGroup: str + dataFlowType: str + targetFormat: str + targetDetails: Dict + flowGroups: List[Dict] + tags: Dict = field(default_factory=dict) + features: Dict = field(default_factory=dict) + cdcSettings: Dict = field(default_factory=dict) + cdcSnapshotSettings: Dict = field(default_factory=dict) + dataFlowVersion: str = None + dataQualityExpectationsEnabled: bool = False + dataQualityExpectationsPath: str = None + dataQualityExpectations: Dict = field(default_factory=dict) + quarantineMode: str = None + quarantineTargetDetails: Dict = field(default_factory=dict) + tableMigrationDetails: Dict = field(default_factory=dict) + localPath: str = None + + def __post_init__(self): + self.dataFlowType = self.dataFlowType.lower() + self.targetFormat = self.targetFormat.lower() + self.quarantineMode = self.quarantineMode.lower() if self.quarantineMode else None + + def get_all_views(self) -> Dict[str, View]: + """Retrieve all views from the flow groups.""" + all_views = {} + + for flow_group in self.get_flow_groups(): + flows = flow_group.get_flows() + for flow_name in flows: + flow = flows[flow_name] + if isinstance(flow, BaseFlowWithViews): + all_views = utility.merge_dicts(all_views, flow.get_views()) + + return 
all_views + + def get_all_cdf_delta_views(self) -> Dict[str, View]: + """Retrieve all views that are of source type DELTA and have CDF enabled.""" + cdf_delta_views = {} + all_views = self.get_all_views() + + for view_name in all_views: + view = all_views[view_name] + if view.sourceType == SourceType.DELTA and view.isCdfEnabled: + cdf_delta_views[view_name] = view + + return cdf_delta_views + + def get_all_delta_source_views(self) -> Dict[str, View]: + """Retrieve all views that directly read from Delta source tables.""" + source_views = {} + target_tables = [] + all_views = self.get_all_views() + + for flow_group in self.get_flow_groups(): + flows = flow_group.get_flows() + for flow_name, flow in flows.items(): + target_tables.append(flow.targetTable) + + for view_name, view in all_views.items(): + if view.sourceType == SourceType.DELTA and view.get_source_details().database.lower() != "live": + if view.get_source_details().table not in target_tables: + source_views[view.viewName] = view + + return source_views + + def get_cdc_settings(self) -> CDCSettings: + """Get CDC configuration for the target table.""" + return CDCSettings(**self.cdcSettings) \ + if self.cdcSettings else None + + def get_cdc_snapshot_settings(self) -> CDCSnapshotSettings: + """Get CDC snapshot settings for the target table.""" + return CDCSnapshotSettings(**self.cdcSnapshotSettings) \ + if self.cdcSnapshotSettings else None + + def get_data_quality_expectations(self) -> DataQualityExpectations: + """Get data quality expectations for the target table.""" + return DataQualityExpectations(**self.dataQualityExpectations) \ + if self.dataQualityExpectationsEnabled and self.dataQualityExpectations else None + + def get_features(self) -> Features: + """Get features for the target table.""" + return Features(**self.features) + + def get_flow_groups(self) -> List[FlowGroup]: + """Retrieve a list of FlowGroup objects from the flowGroups attribute.""" + return [FlowGroup(**item) for item in 
self.flowGroups] + + def get_target_details(self) -> Any: + """Retrieve the target details based on the target format.""" + return TargetFactory.create(self.targetFormat, self.targetDetails) diff --git a/src/dataflow/enums.py b/src/dataflow/enums.py new file mode 100644 index 0000000..6ac6b78 --- /dev/null +++ b/src/dataflow/enums.py @@ -0,0 +1,96 @@ +from dataclasses import dataclass +from enum import Enum + +@dataclass(frozen=True) +class FlowType: + """ + Enumeration of flow types. + + Attributes: + APPEND_SQL (str): Append SQL mode. + APPEND_VIEW (str): Append View mode. + MERGE (str): Merge mode. + MATERIALIZED_VIEW (str): Materialized View mode. + """ + APPEND_SQL: str = "append_sql" + APPEND_VIEW: str = "append_view" + MERGE: str = "merge" + MATERIALIZED_VIEW: str = "materialized_view" + +@dataclass(frozen=True) +class Mode: + """ + Enumeration of execution modes. + + Attributes: + BATCH (str): Batch mode. + STREAM (str): Stream mode. + """ + BATCH: str = "batch" + STREAM: str = "stream" + + +@dataclass +class QuarantineMode(): + """Constants for different quarantine modes.""" + OFF: str = "off" + FLAG: str = "flag" + TABLE: str = "table" + + +@dataclass(frozen=True) +class SourceType: + """ + Enumeration of supported source types. + + Attributes: + BATCH_FILES (str): Batch files source type. + CLOUD_FILES (str): Cloud files source type. + DELTA (str): Delta source type. + DELTA_JOIN (str): Delta join source type. + KAFKA (str): Kafka source type. + PYTHON (str): Python source type. + SQL (str): SQL source type. 
@dataclass
class DataQualityExpectations:
    """
    Dataclass representing data quality expectations.

    Attributes:
        expectationsJson (Dict): JSON data containing the raw expectations.
        expectRules (Dict, optional): Rules for 'expect' expectations.
        expectOrDropRules (Dict, optional): Rules for 'expect_or_drop' expectations.
        expectOrFailRules (Dict, optional): Rules for 'expect_or_fail' expectations.
        _all_rules (Dict, optional): Accepted by the constructor; not read by
            any method of this class.

    Properties:
        all_rules (Dict): Combines all expectation rules into a single dictionary.
    """
    expectationsJson: Dict
    expectRules: Dict = field(default_factory=dict)
    expectOrDropRules: Dict = field(default_factory=dict)
    expectOrFailRules: Dict = field(default_factory=dict)
    _all_rules: Dict = field(default_factory=dict)

    @property
    def all_rules(self) -> Dict:
        """
        Combines all expectation rules into a single dictionary.

        Returns:
            Dict: A dictionary containing all expectation rules.
        """
        return utility.merge_dicts(
            self.expectRules,
            self.expectOrDropRules,
            self.expectOrFailRules)

    def get_expectations(self) -> Dict:
        """
        Get expectations in the format expected by the SDP create table APIs.

        Falsy rule sets (``None`` or empty) are returned as empty dictionaries.

        Returns:
            Dict: Rules keyed by ``expect_all``, ``expect_all_or_drop`` and
            ``expect_all_or_fail``.
        """
        return {
            "expect_all": self.expectRules or {},
            "expect_all_or_drop": self.expectOrDropRules or {},
            "expect_all_or_fail": self.expectOrFailRules or {}
        }

    def get_expectations_as_expect_all(self) -> Dict:
        """
        Get expectations in the format expected by the SDP create table APIs,
        with every rule returned under ``expect_all``.

        Returns:
            Dict: All rules under ``expect_all``; the drop and fail entries
            are empty dictionaries.
        """
        # Merge once: the property rebuilds the combined dictionary on every access.
        merged_rules = self.all_rules
        return {
            "expect_all": merged_rules or {},
            "expect_all_or_drop": {},
            "expect_all_or_fail": {}
        }
@dataclass
class FlowGroup:
    """
    Flow group definition structure.

    Attributes:
        flowGroupId (str): ID of the flow group.
        flows (Dict[str, Dict]): Flow definitions keyed by flow name.
        dataFlowId (str, optional): ID of the data flow.
        stagingTables (Dict[str, Dict], optional): Staging table definitions keyed by table name.

    Methods:
        get_flows() -> Dict[str, BaseFlow]: Get flows associated with the flow group.
        get_staging_tables() -> Dict[str, StagingTable]: Get staging tables associated with the flow group.
    """
    flowGroupId: str
    flows: Dict[str, Dict]
    dataFlowId: str = None
    stagingTables: Dict[str, Dict] = field(default_factory=dict)

    def get_flows(self) -> Dict[str, BaseFlow]:
        """Build a flow object for every flow definition, keyed by flow name."""
        # Keys of a dict are unique, so no duplicate-name check is needed here.
        return {
            name: FlowFactory.create(name, details)
            for name, details in self.flows.items()
        }

    def get_staging_tables(self) -> Dict[str, StagingTable]:
        """Build the staging tables, keyed by their (optionally qualified) name.

        A definition that specifies a ``database`` is keyed as
        ``<database>.<table>``; otherwise the table name alone is the key.

        Raises:
            ValueError: If two definitions resolve to the same key, e.g. a
                pre-qualified name and a name with a matching ``database``.
        """
        staging_tables = {}
        for table, details in self.stagingTables.items():
            database = details.get("database")
            qualified_name = f"{database}.{table}" if database else table
            if qualified_name in staging_tables:
                raise ValueError(f"Multiple staging tables found with the same name: {qualified_name}")
            staging_tables[qualified_name] = StagingTable(table=table, **details)
        return staging_tables
/dev/null +++ b/src/dataflow/flows/__init__.py @@ -0,0 +1,17 @@ +from .base import BaseFlow, BaseFlowWithViews, FlowConfig +from .append_view import FlowAppendView +from .append_sql import FlowAppendSql +from .merge import FlowMerge +from .materialized_view import FlowMaterializedView +from .factory import FlowFactory + +__all__ = [ + 'BaseFlow', + 'BaseFlowWithViews', + 'FlowConfig', + 'FlowAppendView', + 'FlowAppendSql', + 'FlowMerge', + 'FlowMaterializedView', + 'FlowFactory' +] diff --git a/src/dataflow/flows/append_sql.py b/src/dataflow/flows/append_sql.py new file mode 100644 index 0000000..79ed787 --- /dev/null +++ b/src/dataflow/flows/append_sql.py @@ -0,0 +1,70 @@ +from pyspark import pipelines as dp + +import utility + +from ..dataflow_config import DataFlowConfig +from ..enums import Mode +from ..sources.sql import SourceSql +from ..sources.base import ReadConfig +from ..sql import SqlMixin + +from .base import BaseFlow, FlowConfig + + +class FlowAppendSql(BaseFlow, SqlMixin): + """Flow implementation for SQL-based append operations. + + This class handles the creation of append flows that source their data from User defined + SQL Queries. + + Attributes: + flowType (str): Type of the flow (append_sql, append_view, merge). + flowDetails (Dict): Details specific to the flow. + enabled (bool): Whether the flow is enabled. + sqlPath (str): Path to the SQL file containing the transformation logic. + sqlStatement (str): SQL statement to be executed. + + Methods: + get_views() -> Dict: Get the views associated with this flow. + create_flow(config: FlowConfig) -> None: Create a flow using the provided configuration. 
class FlowAppendView(BaseFlowWithViews):
    """
    Create an append flow that streams from a source view into the target table.

    Attributes:
        sourceView (str): Name of the source view.
        columnPrefix (str): Prefix for column names (``column_prefix`` in the flow details).
        columnPrefixExceptions (List[str]): Column names excluded from prefix treatment.
        once (bool): Value of the ``once`` flag passed to the append flow.
    """
    @property
    def columnPrefix(self) -> str:
        """Get the column prefix. Raises KeyError when none is configured."""
        return self.flowDetails["column_prefix"]

    @property
    def columnPrefixExceptions(self) -> List[str]:
        """Get the configured column prefix exceptions (empty when not configured)."""
        return self.flowDetails.get("column_prefix_exceptions", [])

    @property
    def once(self) -> bool:
        """Get the once flag."""
        return self.flowDetails.get("once", False)

    def create_flow(
        self,
        dataflow_config: DataFlowConfig,
        flow_config: FlowConfig
    ):
        """Create an append flow from a view.

        Args:
            dataflow_config: Data flow level configuration (not read by this flow type).
            flow_config: FlowConfig object supplying the columns to exclude
                and the additional column prefix exceptions.
        """

        def get_column_prefix_exceptions(flow_config: FlowConfig) -> List[str]:
            """Collect every column name that must not receive the prefix."""
            # Copy first: extending the configured list in place would mutate
            # flowDetails and grow it on every call.
            exceptions = list(self.columnPrefixExceptions)
            # The additional exceptions are optional and may be None.
            exceptions.extend(flow_config.additional_column_prefix_exceptions or [])
            operational_metadata_schema = pipeline_config.get_operational_metadata_schema()
            if operational_metadata_schema:
                # Compare by column name: the schema fields are StructField
                # objects, which never equal the string column names.
                exceptions.extend(
                    schema_field.name for schema_field in operational_metadata_schema.fields
                )
            return exceptions

        spark = self.spark
        exclude_columns = flow_config.exclude_columns
        column_prefix_exceptions = set(get_column_prefix_exceptions(flow_config))

        source_view_name = f'live.{self.sourceView}'

        @dp.append_flow(name=self.flowName, target=self.targetTable, once=self.once)
        def flow_transform():
            df = spark.readStream.table(source_view_name)
            if "column_prefix" in self.flowDetails:
                prefix = f"{self.columnPrefix.lower()}_"
                df = df.select([
                    df[column].alias(prefix + column)
                    if column not in column_prefix_exceptions
                    else df[column] for column in df.columns
                ])

            if exclude_columns:
                df = utility.drop_columns(df, exclude_columns)
            return df
dataclass, field +from typing import Dict, List, Optional + +from dataflow import targets +import pipeline_config + +from ..cdc import CDCSettings +from ..cdc_snaphot import CDCSnapshotSettings +from ..dataflow_config import DataFlowConfig +from ..view import View + + +@dataclass +class FlowConfig: + """Configuration for flow creation. + + This class encapsulates all possible parameters needed by different flow types, + making the interface consistent while allowing for type-specific parameters. + + Attributes: + exclude_columns: List of columns to exclude from the data + additional_column_prefix_exceptions: List of additional columns to exclude from prefix + cdc_settings: The CDC Settings. + cdc_snapshot_settings: The CDC Snapshot Settings. + target_config_flags: The target config flags. + """ + exclude_columns: Optional[List[str]] = None + additional_column_prefix_exceptions: Optional[List[str]] = None + cdc_settings: Optional[CDCSettings] = None + cdc_snapshot_settings: Optional[CDCSnapshotSettings] = None + target_config_flags: Optional[List[str]] = None + + +@dataclass(kw_only=True) +class BaseFlow(ABC): + """ + Represents a data flow in the pipeline. + + Attributes: + flowType (str): Type of the flow (append_sql, append_view, merge). + flowDetails (Dict): Details specific to the flow. + enabled (bool): Whether the flow is enabled. + + Properties: + targetTable (str): Target table. + + Methods: + create_flow(config: FlowConfig) -> None: Create a flow using the provided configuration. 
@dataclass(kw_only=True)
class BaseFlowWithViews(BaseFlow):
    """
    Represents a data flow that reads from one or more views.

    Attributes:
        flowType (str): Type of the flow (append_sql, append_view, merge).
        flowDetails (Dict): Details specific to the flow.
        enabled (bool): Whether the flow is enabled.
        views (Dict): View definitions keyed by view name.

    Properties:
        sourceView (str): Source view.
        targetTable (str): Target table.

    Methods:
        get_views() -> Dict: Get the views associated with this flow.
        create_flow(dataflow_config, flow_config) -> None: Create a flow using the provided configuration.
    """
    views: Dict = field(default_factory=dict)

    @property
    def sourceView(self) -> str:
        """Get the source view name, or None when the flow details define none."""
        return self.flowDetails.get("sourceView", None)

    def get_views(self) -> Dict[str, View]:
        """Build a View object for every view definition, keyed by view name."""
        # Keys of a dict are unique, so no duplicate-name check is needed here.
        return {
            name: View(viewName=name, **details)
            for name, details in self.views.items()
        }
class FlowMaterializedView(BaseFlowWithViews):
    """
    Flow type for materialized view targets.
    """
    def create_flow(
        self,
        dataflow_config: DataFlowConfig,
        flow_config: FlowConfig
    ):
        """Create a materialized view flow.

        This implementation does nothing and returns None.
        NOTE(review): presumably the materialized view is defined when the
        target table is created; confirm against the target implementation.
        """
        return None
class OperationalMetadataMixin:
    """Mixin class for adding operational metadata to DataFrames."""

    def _add_operational_metadata(
        self,
        spark: SparkSession,
        df: DataFrame,
        operational_metadata_schema: T.StructType,
        pipeline_details: Dict[str, Any]
    ) -> DataFrame:
        """
        Add operational metadata columns to the DataFrame based on the schema.

        Each field of ``operational_metadata_schema`` may carry a ``mapping``
        entry in its metadata. The mapping's ``type`` selects how the column
        value is produced:

        - ``sql``: the mapping's ``sql`` text is evaluated as an expression.
        - ``pipeline_detail``: the mapping's ``key`` is looked up in the
          pipeline details and used as a literal.
        - ``custom``: not implemented yet; yields no column.

        Fields without a usable mapping whose data type is a struct are built
        recursively from their nested fields.

        Args:
            spark: Spark session, used to look up the pipeline update id.
            df: DataFrame to add the columns to.
            operational_metadata_schema: Schema describing the columns to add.
            pipeline_details: Pipeline detail values keyed by name. This
                dictionary is not modified.

        Returns:
            DataFrame: ``df`` unchanged when the schema is empty or None,
            otherwise ``df`` with one column added per resolvable top-level
            field.
        """
        if not operational_metadata_schema:
            return df

        # Work on a copy: callers hand in a live attribute dictionary, and
        # writing the update id into it would leak state back to its owner.
        details = dict(pipeline_details)
        # Get the update id from the spark conf
        details["update_id"] = utility.get_pipeline_update_id(spark)

        # Built once per call rather than once per processed field.
        handlers: Dict[str, Callable[[Dict[str, Any], T.StructField], Optional[Any]]] = {
            MetadataMappingType.SQL.value:
                lambda m, c: F.expr(m.get("sql", "")).cast(c.dataType).alias(c.name),
            MetadataMappingType.PIPELINE_DETAIL.value:
                lambda m, c: F.lit(details.get(m.get("key", ""))).cast(c.dataType).alias(c.name),
            MetadataMappingType.CUSTOM.value:
                lambda m, c: None  # TODO: Implement custom field handling
        }

        def process_field(column: T.StructField) -> Any:
            """Process a field and handle nested structures recursively."""
            mapping = column.metadata.get("mapping", {})
            handler = handlers.get(mapping.get("type", ""))
            if handler is not None:
                result = handler(mapping, column)
                if result is not None:
                    return result

            if isinstance(column.dataType, T.StructType):
                nested_fields = [process_field(f) for f in column.dataType.fields]
                return F.struct([f for f in nested_fields if f is not None]).alias(column.name)

            return None

        for column in operational_metadata_schema.fields:
            result = process_field(column)
            if result is not None:
                df = df.withColumn(column.name, result)

        return df
def __init__(
    self,
    quarantine_mode: str,
    data_quality_rules: Dict = None,
    target_format: str = TargetType.DELTA,
    target_details: TargetDeltaStreamingTable | TargetDeltaMaterializedView = None,
    quarantine_target_details: Dict = None
):
    """
    Set up the manager and, in table mode, create the quarantine table.

    Args:
        quarantine_mode (str): Quarantine mode.
        data_quality_rules (Dict, optional): Data quality expectations; the
            values are boolean SQL expressions. None is treated as no rules.
        target_format (str): Format of the target being protected.
        target_details: Details of the target; its ``table`` (or
            ``sink_name``) attribute names the target.
        quarantine_target_details (Dict, optional): Overrides for the
            quarantine table. None is treated as no overrides.
    """
    self.spark = pipeline_config.get_spark()
    self.logger = pipeline_config.get_logger()
    self.substitution_manager = pipeline_config.get_substitution_manager()
    self.mandatory_table_properties = pipeline_config.get_mandatory_table_properties()
    self.quarantine_mode = quarantine_mode
    # Normalise the None defaults so the .values() / .get() calls made on
    # these dictionaries cannot fail with AttributeError.
    self.data_quality_rules = data_quality_rules or {}
    self.target_format = target_format
    self.quarantine_target_details = quarantine_target_details or {}
    self.target_details = target_details

    # Each rule is wrapped in parentheses so that a rule containing OR keeps
    # its own grouping once the rules are joined with AND.
    wrapped_rules = [f"({rule})" for rule in self.data_quality_rules.values()]
    # With no rules there is nothing to violate; "false" keeps the expression
    # valid SQL instead of producing "NOT()".
    self.quarantine_rules = (
        f"NOT({' AND '.join(wrapped_rules)})" if wrapped_rules else "false"
    )

    is_streaming_delta = (
        self.target_format == TargetType.DELTA
        and getattr(self.target_details, "type", None) == TableType.STREAMING.value
    )
    # The trailing comma matters: without it the parentheses are plain
    # grouping and `in` would not be a tuple membership test.
    # TODO: Add other types as they become supported
    is_streaming_sink = self.target_format in (TargetType.KAFKA_SINK,)
    self.mode = Mode.STREAM if (is_streaming_delta or is_streaming_sink) else Mode.BATCH

    # Default of None lets the lookup fall through to `sink_name` for
    # targets that have no `table` attribute.
    self.target = (
        getattr(self.target_details, "table", None)
        or getattr(self.target_details, "sink_name", None)
    )
    self.quarantine_table = None

    self._init_quarantine()
def _create_quarantine_table(self, quarantine_details: Dict):
    """Build the Delta quarantine target from the given details and create it."""
    table = TargetFactory.create(TargetType.DELTA, quarantine_details)
    self.quarantine_table = table

    self.logger.info(
        "Creating Quarantine Table: %s, Mode: %s, Partition Columns: %s, Cluster By Columns: %s, Cluster By Auto: %s",
        table.table,
        self.mode,
        table.partitionColumns,
        table.clusterByColumns,
        table.clusterByAuto,
    )

    table.create_table()
def create_quarantine_flow(self, source_view_name: str):
    """
    Create the streaming quarantine view and the flow into the quarantine table.

    Only acts when the quarantine mode is table; otherwise it logs and returns.

    Args:
        source_view_name (str): Name of the source view. The quarantine view
            is named ``<source_view_name>_quarantine``.

    Raises:
        ValueError: If the quarantine mode is table but the manager is not in
            stream mode.
    """
    if self.quarantine_mode != QuarantineMode.TABLE:
        self.logger.info("Quarantine mode is not table, skipping quarantine flow creation.")
        return

    if self.mode != Mode.STREAM:
        msg = "Cannot create quarantine flow for batch mode. Batch mode only requires the quarantine MV."
        self.logger.error(msg)
        raise ValueError(msg)

    view_name = f"{source_view_name}_quarantine"

    # The view adds the quarantine flag; the flow appends from that view.
    self._create_quarantine_view_st(
        quarantine_view_name=view_name,
        source_view_name=source_view_name
    )
    self._create_quarantine_flow(
        quarantine_view_name=view_name,
        quarantine_table_name=self.quarantine_table.table
    )
@dataclass
class SchemaMixin(ABC):
    """
    Mixin for schema retrieval and parsing.

    Attributes:
        schemaPath (str, optional): Path to the schema file (JSON or DDL format).

    Properties:
        schema_type (str): Type of schema ["json", "ddl"].
        schema (Union[Dict, str]): Schema JSON or schema DDL, by schema type.
        schema_json (Dict): Schema JSON.
        schema_struct (StructType): Schema structure.
        schema_ddl (str): Schema DDL.
    """
    schemaPath: Optional[str] = None
    _schema_type: Optional[str] = None
    _schema_json: Dict = field(default_factory=dict)
    _schema_ddl: str = field(default_factory=str)
    _schema_struct: T.StructType = field(default=None, init=False)
    _schema_lines: List[str] = field(default_factory=list)
    _schema_constraints: List[str] = field(default_factory=list)

    def _initialize_schema(self) -> None:
        """
        Initialize the schema from the schema path.

        A ``.json`` file is loaded and converted to a StructType. A ``.ddl``
        file is read, passed through the substitution manager, and split into
        column lines and ``CONSTRAINT`` lines.

        Raises:
            ValueError: If ``schemaPath`` is not set, has an unsupported
                extension, or a JSON schema file does not contain an object.
        """
        if not self.schemaPath:
            raise ValueError("schemaPath must be set before the schema can be initialized.")

        # Get schema type
        file_extension = os.path.splitext(self.schemaPath)[1].lower()
        if file_extension not in ('.json', '.ddl'):
            raise ValueError(f"Unsupported schema file extension: {file_extension}. Only .json and .ddl are supported.")

        # Set schema type
        self._schema_type = file_extension[1:]

        if file_extension == '.json':
            schema_json = utility.get_json_from_file(self.schemaPath)
            # Validate before parsing so a malformed file gives this message
            # rather than an error raised from inside StructType.fromJson.
            if not isinstance(schema_json, dict):
                raise ValueError(f"Invalid JSON schema format in {self.schemaPath}")
            self._schema_json = schema_json
            self._schema_struct = T.StructType.fromJson(schema_json)
            return

        with open(self.schemaPath, 'r', encoding="utf-8") as f:
            schema_ddl = f.read()
        substitution_manager = pipeline_config.get_substitution_manager()
        self._schema_ddl = substitution_manager.substitute_string(schema_ddl)

        # Parse schema: one entry per line, trailing commas removed. Blank
        # lines and "--" comment lines are dropped so that joining the entries
        # again never yields an empty column definition.
        lines = [line.strip().rstrip(",") for line in self._schema_ddl.split("\n")]
        lines = [line for line in lines if line and not line.startswith("--")]

        # Substitution was already applied to the whole DDL text above, so the
        # constraint lines are stored as they are (a list, as schema_ddl needs).
        self._schema_constraints = [line for line in lines if line.startswith("CONSTRAINT ")]
        self._schema_lines = [line for line in lines if not line.startswith("CONSTRAINT ")]

    @property
    def schema_type(self) -> Optional[str]:
        """Get the schema type."""
        return self._schema_type

    @property
    def schema(self) -> Optional[Union[Dict, str]]:
        """Get the schema: JSON for a "json" schema, DDL text for a "ddl" schema, else None."""
        if self._schema_type == "json":
            return self.schema_json
        if self._schema_type == "ddl":
            return self.schema_ddl
        return None

    @property
    def schema_json(self) -> Dict:
        """Get the schema JSON, derived from the schema struct when one exists."""
        if self._schema_struct is None:
            # No struct was parsed (for example a DDL schema); fall back to
            # the stored JSON instead of failing on None.
            return self._schema_json
        return self._schema_struct.jsonValue()

    @property
    def schema_struct(self) -> T.StructType:
        """Get the schema struct from the schema path."""
        return self._schema_struct

    @property
    def schema_ddl(self) -> str:
        """Get the schema DDL: column lines followed by constraint lines."""
        schema_lines = self._schema_lines + self._schema_constraints
        return ",\n".join(schema_lines)
@dataclass
class ReadConfig:
    """
    Settings that control how a source is read.

    Attributes:
        features: feature flags.
        mode (str): Read mode, either "stream" or "batch".
        quarantine_rules (str): The quarantine rules.
        uc_enabled (bool): Whether to use UC mode.
        target_config_flags (List[str]): The target config flags.
    """
    features: Features
    mode: str
    quarantine_rules: Optional[str] = None
    uc_enabled: Optional[bool] = True
    target_config_flags: Optional[List[str]] = None

    def __post_init__(self) -> None:
        """Reject a mode that is not a string or not one of the two known modes."""
        mode = self.mode
        if not isinstance(mode, str):
            raise ValueError("Mode must be a string")

        allowed_modes = ("stream", "batch")
        if mode not in allowed_modes:
            raise ValueError("Mode must be either 'stream' or 'batch'")
def read_source(self, read_config: ReadConfig) -> DataFrame:
    """
    Read the source and apply the configured transformations.

    Steps, in order: read the raw DataFrame, apply the Python transform, the
    WHERE clauses and the select expressions, drop the CDF columns, then
    optionally add operational metadata and the quarantine flag.
    """
    session = self.spark
    log = self.logger
    metadata_schema = self.operational_metadata_schema
    details = self.pipeline_details
    feature_flags = read_config.features
    flags = read_config.target_config_flags or []

    frame = self._get_df(read_config)
    frame = self._apply_python_function(frame)
    frame = self._apply_where_clause(frame)
    frame = self._apply_select_exp(frame)

    # CDF columns are not allowed in the target table, so they are removed
    # here. Data engineers who need them can alias them in the selectExp.
    frame = utility.drop_columns(
        frame,
        [column.value for column in SystemColumns.CDFColumns]
    )

    # Operational metadata needs the feature enabled, a schema to be present,
    # and no opt-out flag on the target.
    metadata_wanted = (
        feature_flags.operationalMetadataEnabled
        and self.operational_metadata_schema
        and TargetConfigFlags.DISABLE_OPERATIONAL_METADATA not in flags
    )
    if metadata_wanted:
        log.debug("Adding operational metadata to DataFrame.")
        frame = self._add_operational_metadata(
            session,
            frame,
            metadata_schema,
            details.__dict__
        )

    # The quarantine flag is only added for a non-blank rule expression.
    rules = read_config.quarantine_rules
    if rules and rules.strip():
        log.debug("Adding quarantine flag to DataFrame.")
        flag_name = MetaDataColumnDefs.QUARANTINE_FLAG["name"]
        frame = frame.withColumn(flag_name, F.expr(rules))

    return frame
@dataclass
class BaseSourceWithSchemaOnRead(BaseSource):
    """
    Base class for non-sql sources that can be read with an explicit schema.

    Attributes:
        readerOptions (Dict, optional): Reader options.
        selectExp (List[str], optional): List of select expressions.
        whereClause (List[str], optional): List of WHERE clauses.
        pythonTransform (Dict, optional): Python transform configuration.
        schemaPath (str, optional): Path to the JSON schema file.

    Properties:
        schema_json (Dict, optional): Schema JSON, loaded on first access.
        schema_struct (StructType, optional): Schema struct, or None when no
            schema JSON is available.
    """
    schemaPath: str = None
    _schema_json: Dict[str, Any] = field(default_factory=dict, init=False)

    @property
    def schema_json(self) -> Dict[str, Any]:
        """Return the schema JSON, loading it from the schema path on first use."""
        if self._schema_json:
            return self._schema_json

        path = self.schemaPath
        # Only a non-blank path triggers a load; otherwise the empty default
        # is returned.
        if path and path.strip():
            self._schema_json = utility.get_json_from_file(path)
        return self._schema_json

    @property
    def schema_struct(self) -> T.StructType:
        """Return the StructType built from the schema JSON, or None if it is empty."""
        loaded = self.schema_json
        if not loaded:
            return None
        return T.StructType.fromJson(loaded)
@dataclass(kw_only=True)
class SourceBatchFiles(BaseSourceWithSchemaOnRead):
    """
    Source details for files read in batch mode.

    Attributes:
        format (str): Format of the files, passed to the Spark reader.
        path (str): Path to the files.
    """
    format: str = "csv"
    path: Optional[str] = None

    def _get_df(self, read_config: ReadConfig) -> DataFrame:
        """
        Read the files at ``path`` with a batch Spark reader.

        The configured reader options are applied, and the schema from the
        schema file is used when one is available.

        Args:
            read_config: Read configuration; only logged here.

        Returns:
            DataFrame: The loaded files.
        """
        source_path = self.path
        # Guard against a None value, as the Delta source does.
        reader_options = self.readerOptions.copy() if self.readerOptions else {}
        # Resolve the schema once: the property builds a new struct from the
        # schema JSON on every access.
        schema = self.schema_struct
        spark = self.spark
        logger = self.logger

        logger.debug(f"Batch File Source: Reading {self.format} file from {source_path}")
        logger.debug(f"Batch File Source: Reader Config: {read_config}")

        reader = spark.read.format(self.format).options(**reader_options)
        if schema:
            reader = reader.schema(schema)

        return reader.load(source_path)
+ """ + path: Optional[str] = None + + def _get_df(self, read_config: ReadConfig) -> DataFrame: + """Use Autoloader to ingest file based sources from cloud storage and returns a DataFrame.""" + source_path = self.path + reader_options = self.readerOptions.copy() + schema = self.schema_struct + spark = self.spark + logger = self.logger + + logger.debug(f"Reading Cloud Files from {source_path}") + logger.debug(f"Reader Config: {read_config}") + + reader = spark.readStream.format("cloudFiles").options(**reader_options) + df = reader.schema(schema).load(source_path) \ + if schema else reader.load(source_path) + + return df diff --git a/src/dataflow/sources/delta.py b/src/dataflow/sources/delta.py new file mode 100644 index 0000000..962c872 --- /dev/null +++ b/src/dataflow/sources/delta.py @@ -0,0 +1,102 @@ +from dataclasses import dataclass +from typing import List, Optional + +from pyspark.sql import DataFrame + +import pipeline_config + +from constants import SystemColumns + +from .base import BaseSourceWithSchemaOnRead, ReadConfig +from ..enums import Mode + + +CDF_CHANGE_TYPE_FILTER_VALUES = ["insert", "update_postimage"] +DLT_SETUP_OPERATION = "DLT SETUP" + + +@dataclass(kw_only=True) +class SourceDelta(BaseSourceWithSchemaOnRead): + """ + Source details for Delta tables. + + Attributes: + database (str): Database name. + table (str): Table name. + cdfEnabled (bool, optional): Flag indicating if CDF is enabled. + tablePath (str, optional): Path to the Delta table. + cdfChangeTypeOverride (List[str], optional): Override default CDF change types. + startingVersionFromDLTSetup (bool, optional): Use table SETUP version as starting point. 
def _get_starting_version_from_dlt_setup(self, spark, table_name: str) -> int:
    """
    Return the highest table version recorded for a 'DLT SETUP' operation.

    A name that starts with "live." or has fewer than three dot-separated
    parts is re-qualified with the pipeline catalog and schema, keeping only
    its last part.
    """
    # NOTE(review): a two-part "schema.table" name also has its schema
    # replaced by the pipeline schema here - confirm that this is intended.
    parts = table_name.split(".")
    needs_qualifying = table_name.startswith("live.") or len(parts) < 3

    if needs_qualifying:
        details = pipeline_config.get_pipeline_details()
        database = f"{details.pipeline_catalog}.{details.pipeline_schema}"
        resolved_name = f"{database}.{parts[-1]}"
        self.logger.debug(f"Prepending table name for setting starting version from 'DLT SETUP' operation: {resolved_name}")
    else:
        resolved_name = table_name

    history = spark.sql(f"DESCRIBE HISTORY {resolved_name}")
    setup_rows = history.filter(f"operation = '{DLT_SETUP_OPERATION}'")
    # A single aggregated row comes back; its only column holds the max
    # version.
    aggregated = setup_rows.agg({"version": "max"}).collect()
    return aggregated[0][0]
@dataclass
class DeltaJoin:
    """
    Join details of Delta tables that need to be joined.

    Attributes:
        joinType (str): Type of join e.g. ["left", "inner"].
        condition (str): Condition expressed in SQL syntax e.g. "a.id = b.id".

    Methods:
        get_table_aliases() -> List[str]: Get table aliases from the join condition.
    """
    joinType: str
    condition: str

    def get_table_aliases(self) -> List[str]:
        """
        Get the table aliases referenced in the join condition.

        An alias is any run of word characters immediately followed by a dot.
        Runs made only of digits are skipped, because they are the integer
        part of a numeric literal such as ``1.5`` rather than an alias.

        Returns:
            List[str]: Aliases in order of first appearance, without duplicates.
        """
        pattern = r'(\b\w+)\.'  # Matches word characters before a dot
        matches = re.findall(pattern, self.condition)

        # dict.fromkeys removes duplicates while keeping first-seen order,
        # which a set would not preserve.
        return list(dict.fromkeys(m for m in matches if not m.isdigit()))
+ """ + sources: List + joins: List + + def get_sources(self) -> List[DeltaTable]: + """Get source details for Delta tables.""" + return [DeltaTable(**item) for item in self.sources] + + def get_joins(self) -> List[DeltaJoin]: + """Get join details for Delta tables.""" + return [DeltaJoin(**item) for item in self.joins] + + def _get_df(self, read_config: ReadConfig) -> DataFrame: + """Ingest data from a Delta table based on configured options and return a DataFrame.""" + dfs_to_join = {} + for source in self.get_sources(): + read_config.mode = "batch" if source.joinMode == "static" else source.joinMode + df = source.read_source(read_config) + dfs_to_join[source.alias] = df.alias(source.alias) + + final_df = None + used_aliases = set() + for join in self.get_joins(): + aliases = join.get_table_aliases() + + missing_aliases = [alias for alias in aliases if alias not in dfs_to_join] + if missing_aliases: + raise ValueError(f"Missing DataFrames for aliases: {missing_aliases}") + + # Determine DataFrames to join; start with the first two if final_df is not initialized + if final_df is None: + df1, df2 = (dfs_to_join[aliases[0]], dfs_to_join[aliases[1]]) + final_df = df1.join(df2, on=F.expr(join.condition), how=join.joinType) + used_aliases.update(aliases) + else: + # Join each remaining alias that hasn't been used yet + for alias in aliases: + if alias not in used_aliases: + df = dfs_to_join[alias] + final_df = final_df.join(df, on=F.expr(join.condition), how=join.joinType) + used_aliases.add(alias) + + if final_df is None: + raise ValueError("No joins could be performed. 
Please check the join configuration.") + + return final_df diff --git a/src/dataflow/sources/factory.py b/src/dataflow/sources/factory.py new file mode 100644 index 0000000..7bad560 --- /dev/null +++ b/src/dataflow/sources/factory.py @@ -0,0 +1,85 @@ +from typing import Dict, Type + +from .base import BaseSource +from .batch_files import SourceBatchFiles +from .cloud_files import SourceCloudFiles +from .delta import SourceDelta +from .delta_join import SourceDeltaJoin +from .kafka import SourceKafka +from .python import SourcePython +from .sql import SourceSql +from ..enums import SourceType + + +class SourceFactory: + """Factory for creating BaseSource instances.""" + + # Registry of target types to their corresponding classes + _source_registry: Dict[str, Type[BaseSource]] = { + SourceType.BATCH_FILES: SourceBatchFiles, + SourceType.CLOUD_FILES: SourceCloudFiles, + SourceType.DELTA: SourceDelta, + SourceType.DELTA_JOIN: SourceDeltaJoin, + SourceType.KAFKA: SourceKafka, + SourceType.PYTHON: SourcePython, + SourceType.SQL: SourceSql + } + + @classmethod + def create( + cls, + source_type: str, + source_details: Dict + ) -> BaseSource: + """ + Create a BaseSource instance based on the source type. + + Args: + source_type: The type of source to create + source_details: Configuration dictionary for the source + + Returns: + BaseSource: An instance of the appropriate BaseSource class + + Raises: + ValueError: If source_type is not supported + """ + # Normalize source type + source_type = source_type.lower() + + # Check if source type is supported + if source_type not in cls._source_registry: + supported = ", ".join(cls._source_registry.keys()) + raise ValueError( + f'Unsupported source type "{source_type}". 
' + f'Supported types are: {supported}' + ) + + # Get the appropriate source class + source_class = cls._source_registry[source_type] + + # Create and return the source instance + source = source_class(**source_details) + + return source + + @classmethod + def register_source( + cls, + source_type: str, + source_class: Type[BaseSource] + ) -> None: + """ + Register a new source type. + + Args: + source_type: The identifier for the source type + source_class: The class to instantiate for this source type + + Raises: + ValueError: If source_type is already registered + """ + if source_type in cls._source_registry: + raise ValueError(f'Source type "{source_type}" is already registered') + + cls._source_registry[source_type] = source_class \ No newline at end of file diff --git a/src/dataflow/sources/kafka.py b/src/dataflow/sources/kafka.py new file mode 100644 index 0000000..a39a948 --- /dev/null +++ b/src/dataflow/sources/kafka.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass + +from pyspark.sql import DataFrame + +from .base import ReadConfig, BaseSource + + +@dataclass(kw_only=True) +class SourceKafka(BaseSource): + """ + Source details for Kafka. + + Attributes: + topic (str): Kafka topic. 
+ """ + topic: str = None + + def __post_init__(self): + """Post-initialization for Kafka source configuration.""" + BaseSource.__post_init__(self) + self.readerOptions["topic"] = self.topic + + def read_source(self, read_config: ReadConfig) -> DataFrame: + """Get a DataFrame from the source details with applied transformations.""" + df = self._get_df(read_config) + df = self._apply_python_function(df) + df = self._apply_where_clause(df) + df = self._apply_select_exp(df) + + return df + + def _get_df(self, read_config: ReadConfig) -> DataFrame: + """Execute a SQL query and retrieves the result as a DataFrame.""" + spark = self.spark + reader_options = self.readerOptions.copy() + logger = self.logger + + logger.debug(f"Reading Kafka topic: {self.topic}") + logger.debug(f"Reader options: {reader_options}") + + return spark.readStream.format("kafka").options(**reader_options).load() diff --git a/src/dataflow/sources/python.py b/src/dataflow/sources/python.py new file mode 100644 index 0000000..d4aeecf --- /dev/null +++ b/src/dataflow/sources/python.py @@ -0,0 +1,111 @@ +from dataclasses import dataclass, field +from typing import Any, Callable, Dict + +from pyspark.sql import DataFrame +import pyspark.sql.functions as F + +from constants import MetaDataColumnDefs, SystemColumns +import pipeline_config +import utility + +from .base import ReadConfig +from ..operational_metadata import OperationalMetadataMixin + + +@dataclass(kw_only=True) +class SourcePython(OperationalMetadataMixin): + """ + Source details for Python Function. + One of functionPath, pythonModule, or pythonFunction must be provided. 
+ + - functionPath: Path to a Python file containing a 'get_df' function + - pythonModule: Module.function reference (e.g., 'transforms.get_customer_data') + The module must be in the extensions directory (added to sys.path) + - pythonFunction: Direct function reference (for internal framework use) + + Attributes: + functionPath (str, optional): Path to the Python function file. + pythonModule (str, optional): Module and function reference (e.g., 'module.function'). + pythonFunction (Callable, optional): Python function (internal use). + tokens (Dict, optional): Tokens to be substituted in the Python function. + + Methods: + read_source(config: ReadConfig) -> DataFrame: + Get a DataFrame from the source details with applied transformations. + """ + functionPath: str = None + pythonModule: str = None + pythonFunction: Callable = None + tokens: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + self.spark = pipeline_config.get_spark() + self.logger = pipeline_config.get_logger() + self.substitution_manager = pipeline_config.get_substitution_manager() + self.pipeline_details = pipeline_config.get_pipeline_details() + self.operational_metadata_schema = pipeline_config.get_operational_metadata_schema() + + self.tokens = self.substitution_manager.substitute_dict(self.tokens) + + def read_source(self, read_config: ReadConfig) -> DataFrame: + """Read the source using the provided configuration.""" + spark = self.spark + logger = self.logger + pipeline_details = self.pipeline_details + operational_metadata_schema = self.operational_metadata_schema + + logger.debug(f"Function Path: {self.functionPath}") + logger.debug(f"Python Module: {self.pythonModule}") + logger.debug(f"Tokens: {self.tokens}") + logger.debug(f"Read Config: {read_config}") + + # Load the Python function from one of the available sources + if self.pythonModule: + # Load from extension module (e.g., 'transforms.get_customer_data') + logger.debug(f"Loading Python function from 
module: {self.pythonModule}") + function = utility.load_python_function_from_module( + self.pythonModule, + ["spark", "tokens"] + ) + elif self.functionPath: + # Load from file path + function = utility.load_python_function( + self.functionPath, + "get_df", + ["spark", "tokens"] + ) + elif self.pythonFunction: + # Direct function reference (internal use) + function = self.pythonFunction + else: + raise ValueError( + "One of pythonModule or functionPath must be provided" + ) + + # Get the DataFrame + df = function(spark, self.tokens) + + # Drop cdf columns if present. Important as they are not allowed in the target table. + # If data engineers need them they can alias them in the selectExp + cdf_columns = [column.value for column in SystemColumns.CDFColumns] + df = utility.drop_columns(df, cdf_columns) + + # Add operational metadata if schema is provided + operational_metadata_enabled = read_config.features.operationalMetadataEnabled + if operational_metadata_enabled and operational_metadata_schema: + df = self._add_operational_metadata( + spark, + df, + operational_metadata_schema, + pipeline_details.__dict__ + ) + + # Add quarantine flag + quarantine_rules = read_config.quarantine_rules + if quarantine_rules and quarantine_rules.strip(): + df = df.withColumn( + MetaDataColumnDefs.QUARANTINE_FLAG["name"], + F.expr(quarantine_rules) + ) + + return df diff --git a/src/dataflow/sources/sql.py b/src/dataflow/sources/sql.py new file mode 100644 index 0000000..7b98f67 --- /dev/null +++ b/src/dataflow/sources/sql.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass + +from pyspark.sql import DataFrame + +from .base import BaseSourceSql, ReadConfig + +@dataclass +class SourceSql(BaseSourceSql): + """ + Source details for SQL queries. 
+ """ + def _get_df(self, read_config: ReadConfig) -> DataFrame: + """Execute a SQL query and retrieves the result as a DataFrame.""" + spark = self.spark + logger = self.logger + substitution_manager = self.substitution_manager + + sql = substitution_manager.substitute_string(self.rawSql) + + logger.debug(f"Final SQL Statement: {sql}") + + return spark.sql(sql) diff --git a/src/dataflow/sql.py b/src/dataflow/sql.py new file mode 100644 index 0000000..6d59020 --- /dev/null +++ b/src/dataflow/sql.py @@ -0,0 +1,42 @@ +from dataclasses import dataclass + + +@dataclass +class SqlMixin: + """ + SQL Mixin class that provides SQL path and statement handling. + + Attributes: + sqlPath (str): Path to the SQL file. + sqlStatement (str): SQL statement to execute. + + Properties: + rawSql (str): Lazily loaded raw SQL content from the SQL file. + """ + sqlPath: str = None + sqlStatement: str = None + _sql: str = None + + @property + def rawSql(self) -> str: + """ + Returns the SQL content from either sqlStatement or sqlPath. + If sqlStatement is provided, it is returned directly. + Otherwise, the content from the file at sqlPath is loaded lazily. 
# TODO: Limited to streaming at the moment, fix for Batch
def create_table_import_flow(
    source_details: Any,
    target_table_name: str,
    cdc_settings: CDCSettings = None,
    dataflow_config: DataFlowConfig = None
):
    """Create the run-once flow that imports an existing Delta table into a target.

    Without CDC settings a batch source view plus a one-off append flow are
    created. With CDC settings a streaming source view and a run-once CDC flow
    are created; for SCD type "2" an extra view is inserted that unions the
    source rows with synthetic "delete" rows for closed records (rows whose end
    column is set), so those records are applied as deletes.

    Args:
        source_details: Source configuration mapping for the Delta table; it is
            copied before use.
        target_table_name: Name of the table the flow writes to.
        cdc_settings: CDC settings; when falsy the import is append-only.
        dataflow_config: Passed through to view creation.
    """
    spark = pipeline_config.get_spark()
    logger = pipeline_config.get_logger()
    logger.info("Creating Run Once Flow: %s", source_details)

    source_details_dict = source_details.copy()
    source_details = SourceFactory.create(SourceType.DELTA, source_details_dict)
    view_name = f"v_import_{source_details.table}"
    flow_name = f"f_import_{source_details.table}"
    scd2_columns = [column.value for column in SystemColumns.SCD2Columns]
    additional_except_columns = getattr(source_details, 'exceptColumns', [])

    # Create Flows for table being imported
    if not cdc_settings:
        logger.info(f"Table Import: Append Only: Creating source view: {view_name}")
        View(
            viewName=view_name,
            mode=Mode.BATCH,
            sourceType=SourceType.DELTA,
            sourceDetails=source_details_dict
        ).create_view(
            dataflow_config=dataflow_config
        )

        logger.info(f"Table Import: Creating append flow: {flow_name}")
        # If not CDC create append flow for table being imported
        @dp.append_flow(name=f"f_import_append_{source_details.table}", target=target_table_name, once=True)
        def flow_migrate_table():
            return spark.read.table(view_name)

    else:
        logger.info(f"Table Import: CDC: Creating source view: {view_name}")
        View(
            viewName=view_name,
            mode=Mode.STREAM,
            sourceType=SourceType.DELTA,
            sourceDetails=source_details_dict
        ).create_view(
            dataflow_config=dataflow_config
        )

        cdc_settings_with_deletes = None
        view_with_deletes_name = f"{view_name}_with_deletes"
        if cdc_settings.scd_type == "2":
            logger.debug("Table Import: Handling SCD Type 2")

            is_deleted_column = "is_deleted"
            start_at_column = SystemColumns.SCD2Columns.SCD2_START_AT.value
            end_at_column = SystemColumns.SCD2Columns.SCD2_END_AT.value
            watermark_column = "WATERMARK_COLUMN"

            # Helper columns that must never reach the target. The watermark column
            # is not listed because the view below drops it before the CDC flow runs.
            # The previous `list.copy().extend(*cols)` always raised TypeError
            # (several positional args, and extend() returns None), so any
            # caller-supplied exception list broke the import.
            # dict.fromkeys merges both lists without duplicates, keeping order.
            except_columns = list(dict.fromkeys([
                *(cdc_settings.except_column_list or []),
                is_deleted_column,
                *scd2_columns,
            ]))

            cdc_settings_with_deletes = CDCSettings(
                scd_type=cdc_settings.scd_type,
                keys=cdc_settings.keys,
                sequence_by=start_at_column,
                apply_as_deletes=f"{is_deleted_column} = true",
                ignore_null_updates=cdc_settings.ignore_null_updates,
                except_column_list=except_columns
            )

            # Create view for table being imported with deletes
            logger.debug(f"Table Import: Creating view to handle soft deletes / closed records: {view_with_deletes_name}")

            @dp.view(name=view_with_deletes_name)
            def view_with_deletes():
                # Read stream and add arbitrary watermark to allow grouping and aggregation
                df = (spark.readStream.table(view_name)
                    .withColumn(is_deleted_column, F.lit(False))
                    .withColumn(watermark_column, F.lit('2000-01-01').cast("timestamp"))
                )
                # For each key keep the row with the greatest start value; when that
                # row is closed (end column set) emit it again as a delete that
                # takes effect at its end value.
                df_closed_rows = (df
                    .withWatermark(watermark_column, "10 minutes")
                    .groupBy(
                        *cdc_settings_with_deletes.keys,
                        F.window(watermark_column, "10 minutes")
                    )
                    .agg(
                        F.max_by(F.struct("*"), start_at_column).alias("max_row")
                    )
                    .select("max_row.*")
                    .withColumn(start_at_column, F.col(end_at_column))
                    .withColumn(is_deleted_column, F.lit(True))
                    .where(F.col(end_at_column).isNotNull())
                )

                return df.unionAll(df_closed_rows).drop(watermark_column)

        CDCFlow(
            cdc_settings_with_deletes
            if cdc_settings_with_deletes
            else cdc_settings
        ).create(
            target_table=target_table_name,
            source_view_name=view_name if cdc_settings.scd_type != "2" else view_with_deletes_name,
            flow_name=flow_name,
            additional_except_columns=additional_except_columns,
            run_once=True
        )
+ + Methods: + get_source_details() -> SourceDelta: Get source details for migration. + """ + enabled: bool + catalogType: str + sourceDetails: Dict + autoStartingVersionsEnabled: bool = True + + def get_source_details(self) -> SourceDelta: + """Get source details for migration.""" + return SourceDelta(**self.sourceDetails) + + +class TableMigrationManager: + """ + Manage Delta Table Migration. + + Attributes: + dataflow_spec (DataflowSpec): Dataflow specification. + target_database (str): Target database. + target_table_name (str): Target table name. + cdc_settings (CDCSettings): The CDC Settings. + dataflow_config (DataFlowConfig): Dataflow configuration. + + Methods: + initialize_state(views: Dict[str, str]) -> None: Initialize table migration state. + create_flow() -> None: Create table migration flow. + set_view_starting_versions(views: Dict[str, View]) -> Dict[str, View]: Set view starting versions. + """ + def __init__( + self, + dataflow_spec: DataflowSpec, + target_database: str, + target_table_name: str, + cdc_settings: CDCSettings = None, + dataflow_config: DataFlowConfig = None, + ): + self.dataflow_spec = dataflow_spec + self.target_database = target_database + self.target_table_name = target_table_name + self.cdc_settings = cdc_settings + self.dataflow_config = dataflow_config + + self.spark = pipeline_config.get_spark() + self.dbutils = pipeline_config.get_dbutils() + self.logger = pipeline_config.get_logger() + self.operational_metadata_schema = pipeline_config.get_operational_metadata_schema() + self.pipeline_details = pipeline_config.get_pipeline_details() + self.substitution_manager = pipeline_config.get_substitution_manager() + + # Initialize table migration details + self.table_migration_details = TableMigrationDetails( + **self.dataflow_spec.tableMigrationDetails + ) if self.dataflow_spec.tableMigrationDetails else None + + self.table_migration_enabled = ( + self.table_migration_details.enabled + if self.table_migration_details else False + ) 
+ + self.logger.info(f"Table Migration enabled for table: {self.target_table_name} - {self.table_migration_enabled}") + + # Initialize auto starting versions enabled + self.auto_starting_versions_enabled = ( + self.table_migration_details.autoStartingVersionsEnabled + if self.table_migration_details else False # Default to True if not specified + ) + + # Initialize delta source views (empty if migration disabled) + self.delta_source_views = ( + {view.viewName: view for view in self.dataflow_spec.get_all_delta_source_views().values()} + if self.table_migration_enabled else {} + ) + + # Initialize checkpoint state variables + self.checkpoint_state_volume_root_path = None + self.checkpoint_state_initial_versions_path = None + self.checkpoint_state_tracking_path = None + self.checkpoint_state_table_schema = None + self.table_checkpoint_versions = {} + + # Initialize state (method handles migration disabled case internally) + self._initialize_state() + + def _initialize_state(self): + """Set up checkpoint state based on the migration details.""" + if not self.table_migration_enabled or not self.auto_starting_versions_enabled: + self.logger.info( + "Table migration disabled, skipping state initialization" if not self.table_migration_enabled + else "Table migration enabled, but auto starting version management is disabled. 
Skipping state initialization" + ) + return + + self.checkpoint_state_volume_root_path = pipeline_config.get_table_migration_state_volume_path() + self.checkpoint_state_initial_versions_path = f"{self.checkpoint_state_volume_root_path}/initial_versions" + self.checkpoint_state_tracking_path = f"{self.checkpoint_state_volume_root_path}/tracking" + + self.logger.info( + f"Table Migration - Paths:\n" + f" - root: {self.checkpoint_state_volume_root_path}\n" + f" - initial versions: {self.checkpoint_state_initial_versions_path}\n" + f" - tracking: {self.checkpoint_state_tracking_path}" + ) + + self.checkpoint_state_initial_versions_schema = T.StructType([ + T.StructField("pipelineId", T.StringType(), False), + T.StructField("targetTable", T.StringType(), False), + T.StructField("tableName", T.StringType(), False), + T.StructField("viewName", T.StringType(), False), + T.StructField("initialVersion", T.IntegerType(), False) + ]) + self.checkpoint_state_tracking_schema = T.StructType([ + T.StructField("pipelineId", T.StringType(), False), + T.StructField("targetTable", T.StringType(), False), + T.StructField("tableName", T.StringType(), False), + T.StructField("viewName", T.StringType(), False), + T.StructField("version", T.IntegerType(), False), + T.StructField("currentVersion", T.IntegerType(), True), + T.StructField("ready", T.BooleanType(), False) + ]) + + self._track_checkpoint_state() + self._set_view_starting_versions() + + self.logger.debug(f"Table Migration - table: {self.target_table_name}, final dataflow spec: {json.dumps(self.dataflow_spec.__dict__, indent=4)}") + + def create_flow(self): + """Set up table migration flow based on the migration details.""" + if self.table_migration_enabled: + self.logger.info("Table Migration Setup...") + create_table_import_flow( + source_details=self.table_migration_details.sourceDetails, + target_table_name=self.target_table_name, + cdc_settings=self.cdc_settings, + dataflow_config=self.dataflow_config + ) + else: + 
self.logger.info("Table Migration is disabled, skipping flow creation") + + def _track_checkpoint_state(self): + """Track the table migration checkpoint state.""" + self.logger.info(f"Table Migration - table: {self.target_table_name}, tracking delta table state") + + if self.delta_source_views: + self.logger.debug(f"Table Migration - table: {self.target_table_name}," + f" delta views: {list(self.delta_source_views.keys())}" + ) + + # check if checkpoint state store exists + initial_versions_store_exists = self._state_store_exists(self.checkpoint_state_initial_versions_path) + tracking_store_exists = self._state_store_exists(self.checkpoint_state_tracking_path) + + self.logger.debug(f"Table Migration - table: {self.target_table_name}," + f" initial versions store exists - {initial_versions_store_exists}\n" + f" tracking store exists - {tracking_store_exists}" + ) + + # get initial source table versions + initial_versions_df = None + if not initial_versions_store_exists: + self.logger.debug(f"Table Migration - table: {self.target_table_name}," + f" initial versions store does not exist, getting initial source table versions" + ) + initial_versions_df = self._get_source_table_versions() + self._write_state_store(initial_versions_df, self.checkpoint_state_initial_versions_path) + else: + self.logger.debug(f"Table Migration - table: {self.target_table_name}," + f" initial versions store exists, reading initial source table versions" + ) + initial_versions_df = self._read_state_store( + path=self.checkpoint_state_initial_versions_path, + schema=self.checkpoint_state_initial_versions_schema + ) + + # TODO: remove this once final defect with Auto CDC and group by is tested and this is no longer needed + # if initial_versions_df.count() < 1: + # self.logger.debug(f"Table Migration - table: {self.target_table_name}," + # f" initial versions df is empty, getting initial source table versions" + # ) + # initial_versions_df = self._get_source_table_versions() + # 
self._write_state_store(initial_versions_df, self.checkpoint_state_initial_versions_path) + + # Get and save migration state + checkpoint_state_df = self.get_migration_state(initial_versions_df, tracking_store_exists) + self.table_checkpoint_versions = { + row.viewName: { + "tableName": row.tableName, + "version": row.version, + "currentVersion": row.currentVersion, + "ready": row.ready, + } + for row in checkpoint_state_df.collect() + } + + self._write_state_store(checkpoint_state_df, self.checkpoint_state_tracking_path) + + def _get_source_table_versions(self) -> DataFrame: + """Get the initial source table versions.""" + self.logger.debug( + f"Table Migration - table: {self.target_table_name}," + f" getting initial source table versions for - {list(self.delta_source_views.keys())}" + ) + delta_views = { + view_name: f"{view.get_source_details().database}.{view.get_source_details().table}" + for view_name, view in self.delta_source_views.items() + } + self.logger.debug(f"Table Migration - table: {self.target_table_name}, delta views dict: {json.dumps(delta_views, indent=4)}") + initial_versions_df = utility.get_table_versions(self.spark, delta_views) + initial_versions_df = (initial_versions_df + .withColumn("pipelineId", F.lit(self.pipeline_details.pipeline_id)) + .withColumn("targetTable", F.lit(self.target_table_name)) + .withColumn("initialVersion", F.col("version")) + .select("pipelineId", "targetTable", "tableName", "viewName", "initialVersion")) + + return initial_versions_df + + def get_migration_state(self, initial_versions_df: DataFrame, checkpoint_state_store_exists: bool,) -> DataFrame: + """Get the migration state.""" + self.logger.debug( + f"Table Migration - table: {self.target_table_name} getting migration state." 
+ ) + + if initial_versions_df.count() < 1: + msg = f"Table Migration - table: {self.target_table_name}, initial versions not stored yet" + self.logger.error(msg) + raise RuntimeError(msg) + + self.logger.debug("Table Migration - table: %s, initial versions df:\n%s", self.target_table_name, initial_versions_df.show(truncate=False)) + + checkpoint_state_df = None + if checkpoint_state_store_exists: + self.logger.debug(f"Table Migration - table: {self.target_table_name}, checkpoint state store exists, reading checkpoint state") + checkpoint_state_df = self._read_state_store( + path=self.checkpoint_state_tracking_path, + schema=self.checkpoint_state_tracking_schema + ) + checkpoint_state_df.show() + checkpoint_state_df = initial_versions_df.alias("initial").join( + checkpoint_state_df.alias("checkpoint"), + on=["pipelineId", "targetTable", "viewName"], + how="left" + ).selectExpr( + "COALESCE(initial.pipelineId, checkpoint.pipelineId) AS pipelineId", + "COALESCE(initial.targetTable, checkpoint.targetTable) AS targetTable", + "COALESCE(initial.tableName, checkpoint.tableName) AS tableName", + "initial.viewName", + "initial.initialVersion AS version", + "COALESCE(checkpoint.currentVersion, initial.initialVersion) AS currentVersion", + "COALESCE(false, checkpoint.ready) AS ready" + ) + checkpoint_state_df.show() + else: + self.logger.debug( + f"Table Migration - table: {self.target_table_name}," + f" checkpoint state store does not exist, creating initial checkpoint state" + ) + checkpoint_state_df = initial_versions_df.selectExpr( + "pipelineId", + "targetTable", + "tableName", + "viewName", + "initialVersion AS version", + "initialVersion AS currentVersion", + "false AS ready" + ) + + # get delta views and tables from checkpoint state table + delta_views = {row.viewName: row.tableName for row in checkpoint_state_df.where("ready = false").collect()} + if delta_views: + # get current table versions from source delta tables + self.logger.debug(f"Table Migration - table: 
{self.target_table_name}, delta views dict: {json.dumps(delta_views, indent=4)}") + current_table_versions_df = utility.get_table_versions(self.spark, delta_views) + + # join checkpoint state df with current table versions df + checkpoint_state_df = ( + checkpoint_state_df.alias("checkpoint") + .join(current_table_versions_df.alias("current"), on=["viewName"], how="left") + .selectExpr( + "pipelineId", + "targetTable", + "checkpoint.tableName", + "checkpoint.viewName", + "checkpoint.version", + "COALESCE(current.version, checkpoint.currentVersion) AS currentVersion", + "COALESCE(current.version > checkpoint.version, checkpoint.ready) AS ready" + ) + ) + + self.logger.debug("Table Migration - table: %s, checkpoint state df:\n%s", self.target_table_name, checkpoint_state_df.show(truncate=False)) + + return checkpoint_state_df + + def _write_state_store(self, checkpoint_state_df: DataFrame, path: str): + """Write the checkpoint state df to the checkpoint state table.""" + pipeline_id = self.pipeline_details.pipeline_id + options = { + "header": "true", + "replaceWhere": f"pipelineId = '{pipeline_id}' AND targetTable = '{self.target_table_name}'" + } + (checkpoint_state_df.write.format("csv") + .options(**options) + .mode("overwrite") + .partitionBy("pipelineId", "targetTable") + .save(path) + ) + + def _read_state_store(self, path: str, schema: T.StructType = None) -> DataFrame: + """Read the checkpoint state df from the checkpoint state table.""" + pipeline_id = self.pipeline_details.pipeline_id + target_table = self.target_table_name + self.logger.debug(f"Table Migration - table: {self.target_table_name}, reading state from - {path}") + + reader = self.spark.read.format("csv").option("header", "true") + reader = reader.schema(schema) if schema else reader + return reader.load(path).where(f"pipelineId = '{pipeline_id}' AND targetTable = '{target_table}'") + + def _state_store_exists(self, path: str) -> bool: + """Check if the checkpoint state table exists.""" + try: 
+ df = self._read_state_store(path) + return df.count() > 0 + except (Exception): + self.logger.debug(f"Table Migration - table: {self.target_table_name}, path does not exist yet - {path}") + return False + + def _set_view_starting_versions(self): + """ + Sets the appropriate starting version for the view. If the checkpoint state is not ready, + a where clause will be added, preventing rows from being returned + """ + spec = self.dataflow_spec + for flow_group in spec.flowGroups: + for flow in flow_group.get("flows", {}).values(): + for view_name, view in flow.get("views", {}).items(): + checkpoint_state = self.table_checkpoint_versions.get(view_name, None) + self.logger.debug( + f"Table Migration - table: {self.target_table_name}, view: {view_name}, " + f"checkpoint state to be applied: {json.dumps(checkpoint_state, indent=4)}" + ) + if checkpoint_state: + source_details = view.get("sourceDetails") + if checkpoint_state["ready"]: + version = int(checkpoint_state["version"]) + 1 + if "readerOptions" in source_details: + option = {"startingVersion": version} + source_details["readerOptions"].update(option) + self.logger.debug(f"Table Migration - table: {self.target_table_name}, view: {view_name}, reader options updated - {option}") + else: + option = {"startingVersion": version} + source_details["readerOptions"] = option + self.logger.debug(f"Table Migration - table: {self.target_table_name}, view: {view_name}, reader options set - {option}") + else: + source_details["whereClause"] = ["1=0"] + self.logger.debug(f"Table Migration - table: {self.target_table_name}, view: {view_name}, where clause set to 1=0") \ No newline at end of file diff --git a/src/dataflow/targets/__init__.py b/src/dataflow/targets/__init__.py new file mode 100644 index 0000000..3465ad1 --- /dev/null +++ b/src/dataflow/targets/__init__.py @@ -0,0 +1,19 @@ +from .delta_materialized_view import TargetDeltaMaterializedView +from .delta_streaming_table import TargetDeltaStreamingTable +from 
.sink_delta import TargetDeltaSink +from .sink_foreach_batch import TargetForEachBatchSink +from .sink_custom_python import TargetCustomPythonSink +from .sink_kafka import TargetKafkaSink +from .staging_table import StagingTable +from .factory import TargetFactory + +__all__ = [ + 'TargetDeltaMaterializedView', + 'TargetDeltaStreamingTable', + 'TargetDeltaSink', + 'TargetForEachBatchSink', + 'TargetKafkaSink', + 'TargetCustomPythonSink', + 'TargetFactory', + 'StagingTable', +] diff --git a/src/dataflow/targets/base.py b/src/dataflow/targets/base.py new file mode 100644 index 0000000..0687a80 --- /dev/null +++ b/src/dataflow/targets/base.py @@ -0,0 +1,337 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, List, Dict, Optional, Union, TypeVar +import os + +from pyspark import pipelines as dp +import pyspark.sql.types as T + +import pipeline_config +import utility + +from ..enums import TableType, TargetConfigFlags + +Self = TypeVar("Self", bound="BaseTargetDelta") + + +CONSTRAINT_KEY_WORDS = ("CONSTRAINT ", "PRIMARY KEY ", "FOREIGN KEY ") + + +@dataclass(kw_only=True) +class BaseTargetDelta(): + """ + Target details structure for Delta targets. + + Attributes: + table (str): Table name. + database (str, optional): Database name. + type (str, optional): Type of table ["st", "mv"]. Defaults to "st". + tableProperties (Dict, optional): Properties of the target table. + partitionColumns (List[str], optional): List of partition columns. + clusterByColumns (List[str], optional): List of cluster by columns. + clusterByAuto (bool, optional): Whether to enable cluster by auto. + schemaPath (str, optional): Path to the schema file (JSON or DDL format). + tablePath (str, optional): Path to the Delta table. + comment (str, optional): Comment for the target table. + sparkConf (Dict, optional): Spark configuration for the target table. + rowFilter (str, optional): Row filter for the target table. 
+ private (bool, optional): Whether the target table is private. + + Properties: + schema_type (str): Type of schema ["json", "ddl"]. + schema (Union[Dict, str]): Schema structure. + schema_json (Dict): Schema JSON. + schema_struct (StructType): Schema structure. + schema_ddl (str): Schema DDL. + + Methods: + add_columns: Add columns to the target schema. + remove_columns: Remove columns from the target schema. + add_table_properties: Add table properties to the target details. + """ + table: str + database: Optional[str] = None + type: str = "st" + tableProperties: Dict = field(default_factory=dict) + partitionColumns: List[str] = field(default_factory=list) + clusterByColumns: List[str] = None #Must be passed as None as API expects None or a list with at least one column. [] causes an error + clusterByAuto: bool = False + schemaPath: Optional[str] = None + tablePath: Optional[str] = None + configFlags: List[str] = field(default_factory=list) + comment: Optional[str] = None + sparkConf: Optional[Dict] = None + rowFilter: Optional[str] = None + private: Optional[bool] = None + _schema_type: str = None + _schema_json: Dict = field(default_factory=dict) + _schema_ddl: str = None + _schema_struct: T.StructType = field(default=None, init=False) + _schema_lines: List[str] = field(default_factory=list) + _schema_constraints: List[str] = field(default_factory=list) + + def __post_init__(self): + """Initialize the target details and validate the configuration.""" + self.spark = pipeline_config.get_spark() + self.logger = pipeline_config.get_logger() + self.mandatory_table_properties = pipeline_config.get_mandatory_table_properties() + self.operational_metadata_schema = pipeline_config.get_operational_metadata_schema() + self.pipeline_details = pipeline_config.get_pipeline_details() + self.substitution_manager = pipeline_config.get_substitution_manager() + + self.tableProperties = utility.merge_dicts_recursively( + self.mandatory_table_properties, + self.tableProperties + ) 
def _initialize_schema(self) -> None:
    """
    Initialize the schema from the file at ``self.schemaPath``.

    The file extension selects the schema type:

    - ``.json``: the file is loaded and converted to a ``StructType``.
    - ``.ddl``: the file is read as text and split into column lines and
      constraint lines (lines starting with one of ``CONSTRAINT_KEY_WORDS``).
      Blank lines and ``--`` comment lines are dropped.

    When an operational metadata schema is configured, its fields are appended
    to the schema afterwards.

    Raises:
        ValueError: If the file extension is not ``.json`` / ``.ddl`` or the
            JSON schema file does not contain a JSON object.
    """
    file_extension = os.path.splitext(self.schemaPath)[1].lower()
    if file_extension not in ('.json', '.ddl'):
        raise ValueError(f"Unsupported schema file extension: {file_extension}. Only .json and .ddl are supported.")

    # Schema type is the extension without the leading dot.
    self._schema_type = file_extension[1:]

    if file_extension == '.json':
        schema_json = utility.get_json_from_file(self.schemaPath)
        # Validate before handing the payload to StructType.fromJson so that a
        # malformed file surfaces as the intended ValueError.
        if not isinstance(schema_json, dict):
            raise ValueError(f"Invalid JSON schema format in {self.schemaPath}")
        self._schema_json = schema_json
        self._schema_struct = T.StructType.fromJson(schema_json)
    else:
        with open(self.schemaPath, 'r', encoding='utf-8') as f:
            self._schema_ddl = f.read()

        # One entry per line, without surrounding whitespace or the trailing comma.
        stripped_lines = (line.strip().rstrip(",").strip() for line in self._schema_ddl.split("\n"))
        # Blank lines (e.g. the one produced by a trailing newline) would otherwise
        # become empty entries when the lines are re-joined with commas.
        ddl_lines = [line for line in stripped_lines if line and not line.startswith("--")]

        self._schema_constraints = [line for line in ddl_lines if line.startswith(CONSTRAINT_KEY_WORDS)]
        self._schema_lines = [line for line in ddl_lines if not line.startswith(CONSTRAINT_KEY_WORDS)]

    # Initialize operational metadata schema
    if self.operational_metadata_schema:
        self.logger.info(f"Adding operational metadata schema to table: {self.table}")
        self._add_columns(self.operational_metadata_schema.fields)
def _add_columns(self, columns: Union[List[T.StructField], List[Dict]]):
    """
    Add columns to the target schema, skipping columns that already exist.

    Args:
        columns: Columns to add, as ``StructField`` objects or as dicts accepted
            by ``StructField.fromJson``.

    Raises:
        ValueError: If no schema has been initialized, or a column is neither a
            dict nor a ``StructField``.
    """
    if not self._schema_struct and not self._schema_lines:
        raise ValueError(
            f"Attempting to add columns to table: {self.table} but schema structure is not initialized.")

    # For DDL schemas each line is "<name> <type> ...". Compare against the column
    # name (first token), not the whole line, otherwise duplicates are never detected.
    ddl_column_names = set()
    if self.schema_type == "ddl":
        ddl_column_names = {
            line.split(None, 1)[0].strip("`")
            for line in self._schema_lines
            if line.strip()
        }

    for column in columns:
        column = T.StructField.fromJson(column) if isinstance(column, dict) else column
        if not isinstance(column, T.StructField):
            raise ValueError(f"Unsupported column format: {type(column)}. Must be Dict or StructField.")

        if self.schema_type == "json":
            if column.name not in self._schema_struct.fieldNames():
                self._schema_struct = self._schema_struct.add(column)
        elif self.schema_type == "ddl":
            if column.name not in ddl_column_names:
                # simpleString() renders "name:type"; DDL lines use "name type".
                self._schema_lines.append(column.simpleString().replace(":", " "))
                ddl_column_names.add(column.name)
def create_table(
    self,
    expectations: Dict = None
) -> None:
    """
    Create the target table for the data flow.

    Resolves the schema to pass on (a ``StructType`` for JSON schemas, a DDL
    string for DDL schemas, ``None`` when no schema is configured) and delegates
    to the target specific ``_create_table``.

    Args:
        expectations: Optional dictionary containing:
            - expect_all: Rules that log violations
            - expect_all_or_drop: Rules that drop violating records
            - expect_all_or_fail: Rules that fail the pipeline on violations
    """
    logger = self.logger
    substitution_manager = self.substitution_manager

    logger.info(f"Creating Delta Table: {self.table}, Type: {self.type}")

    schema = None
    if self.schema_type == "json":
        schema = self.schema_struct
    elif self.schema_type == "ddl":
        # The DDL schema must be used even when no substitution manager is
        # configured; substitutions are only applied when one is available.
        schema = self.schema
        if substitution_manager:
            schema = substitution_manager.substitute_string(schema)

    logger.debug(f"Schema Type: {self.schema_type}, Schema for {self.table}: {schema}")
    logger.debug(f"Expectations: {self.table}, {expectations}")
    logger.debug(f"Config Flags: {self.configFlags}")

    self._create_table(schema, expectations)
+ """ + configFlags: List[str] = [] + + def __init__(self): + self.spark = pipeline_config.get_spark() + self.logger = pipeline_config.get_logger() + self.substitution_manager = pipeline_config.get_substitution_manager() + + @property + @abstractmethod + def sink_name(self) -> str: + """Returns the name of the sink.""" + + @property + @abstractmethod + def sink_type(self) -> str: + """Returns the type of the sink.""" + + @property + @abstractmethod + def sink_options(self) -> Dict[str, Any]: + """Returns the options for the sink configuration.""" + + def create_sink(self) -> None: + """Create a sink with the specified name, type, and options.""" + logger = self.logger + logger.info(f"Creating Sink: {self.sink_name}, Type: {self.sink_type}") + logger.info(f"Sink Options: {self.sink_options}") + dp.create_sink(self.sink_name, self.sink_type, self.sink_options) diff --git a/src/dataflow/targets/delta_materialized_view.py b/src/dataflow/targets/delta_materialized_view.py new file mode 100644 index 0000000..fab50cf --- /dev/null +++ b/src/dataflow/targets/delta_materialized_view.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from typing import Dict, Optional + +from pyspark import pipelines as dp +from pyspark.sql import types as T + +from ..operational_metadata import OperationalMetadataMixin +from ..sql import SqlMixin + +from .base import BaseTargetDelta + + +@dataclass(kw_only=True) +class TargetDeltaMaterializedView(BaseTargetDelta, SqlMixin, OperationalMetadataMixin): + """ + Target details structure for Delta targets. + + Attributes: + table (str): Table name. + type (str, optional): Type of table ["st", "mv"]. Defaults to "st". + tableProperties (Dict, optional): Properties of the target table. + partitionColumns (List[str], optional): List of partition columns. + clusterByColumns (List[str], optional): List of cluster by columns. + clusterByAuto (bool, optional): Whether to enable cluster by auto. 
def _create_table(
    self,
    schema: T.StructType | str,
    expectations: Dict = None
) -> None:
    """
    Create the materialized view for the data flow.

    The query is ``SELECT * FROM live.<sourceView>`` when ``sourceView`` is set,
    otherwise the raw SQL (``rawSql``) with substitutions applied when a
    substitution manager is configured.

    Args:
        schema: Schema passed to the table definition (StructType, DDL string or None).
        expectations: Optional dictionary with ``expect_all``,
            ``expect_all_or_drop`` and ``expect_all_or_fail`` rules.

    Raises:
        ValueError: If none of sourceView / sqlPath / sqlStatement is set, or no
            SQL statement could be resolved.
    """
    spark = self.spark
    logger = self.logger
    operational_metadata_schema = self.operational_metadata_schema
    pipeline_details = self.pipeline_details
    substitution_manager = self.substitution_manager

    msg = (
        f"SQL Settings for MV: {self.table}\n"
        f"Source View: {self.sourceView}\n"
        f"SQL Path: {self.sqlPath}\n"
        f"SQL Statement: {self.sqlStatement}\n"
    )
    logger.debug(msg)

    if not self.sourceView and not self.sqlPath and not self.sqlStatement:
        raise ValueError(
            "Error: sourceView or sqlPath or sql Statement must be set when creating a Materialized View"
        )

    sql = None
    if self.sourceView:
        sql = f"SELECT * FROM live.{self.sourceView}"
    elif self.rawSql:
        # Substitutions are optional, matching how the DDL schema is handled.
        sql = self.rawSql
        if substitution_manager:
            sql = substitution_manager.substitute_string(sql)

    # Fail at definition time rather than with an opaque error inside the query function.
    if not sql:
        raise ValueError(f"Error: no SQL statement could be resolved for Materialized View: {self.table}")

    expect_all = expectations.get("expect_all", {}) if expectations else {}
    expect_all_or_drop = expectations.get("expect_all_or_drop", {}) if expectations else {}
    expect_all_or_fail = expectations.get("expect_all_or_fail", {}) if expectations else {}

    @dp.table(
        name=self.table,
        comment=self.comment,
        spark_conf=self.sparkConf,
        row_filter=self.rowFilter,
        path=self.tablePath,
        schema=schema,
        table_properties=self.tableProperties,
        partition_cols=self.partitionColumns,
        cluster_by=self.clusterByColumns,
        cluster_by_auto=self.clusterByAuto,
        private=self.private
    )
    @dp.expect_all(expect_all)
    @dp.expect_all_or_drop(expect_all_or_drop)
    @dp.expect_all_or_fail(expect_all_or_fail)
    def mv_query():
        df = spark.sql(sql)

        # Add operational metadata if needed
        if operational_metadata_schema:
            df = self._add_operational_metadata(
                spark,
                df,
                operational_metadata_schema,
                pipeline_details.__dict__
            )

        return df
class TargetFactory:
    """Factory that maps a target type (and, for Delta, a table type) to its target class."""

    # target type -> {table type (None for sinks) -> target class}
    _registry: Dict[str, Dict[str, Type[Any]]] = {
        TargetType.DELTA: {
            TableType.MATERIALIZED_VIEW: TargetDeltaMaterializedView,
            TableType.STREAMING: TargetDeltaStreamingTable
        },
        TargetType.DELTA_SINK: {None: TargetDeltaSink},
        TargetType.KAFKA_SINK: {None: TargetKafkaSink},
        TargetType.FOREACH_BATCH_SINK: {None: TargetForEachBatchSink},
        TargetType.CUSTOM_PYTHON_SINK: {None: TargetCustomPythonSink}
    }

    @classmethod
    def create(
        cls,
        target_type: str,
        target_details: Dict
    ) -> Any:
        """
        Instantiate the target class registered for ``target_type``.

        Args:
            target_type: The type of target to create (matched case-insensitively).
            target_details: Configuration dictionary, passed as keyword arguments
                to the target class. Delta targets must carry a ``type`` entry.

        Returns:
            Any: An instance of the selected target class.

        Raises:
            ValueError: If the target type is not registered, or a Delta target
                has a missing / unsupported table type.
        """
        target_type = target_type.lower()

        candidates = cls._registry.get(target_type)
        if candidates is None:
            supported = ", ".join(cls._registry.keys())
            raise ValueError(
                f'Unsupported target type "{target_type}". '
                f'Supported types are: {supported}'
            )

        # Sinks have a single implementation stored under the None key.
        if target_type != TargetType.DELTA:
            return candidates[None](**target_details)

        table_type = target_details.get('type', None)
        if not table_type:
            raise ValueError("Table type must be specified for Delta targets")

        table_type = table_type.lower()
        if table_type not in candidates:
            supported = ", ".join(candidates.keys())
            raise ValueError(
                f'Unsupported table type "{table_type}" for Delta target. '
                f'Supported types are: {supported}'
            )

        return candidates[table_type](**target_details)
+ """ + name: str + sinkOptions: Dict = field(default_factory=dict) + + def __post_init__(self): + BaseSink.__init__(self) + + @property + def sink_name(self) -> str: + """Returns the name of the sink.""" + return self.name + + @property + def sink_type(self) -> str: + """Returns the type of the sink.""" + return SinkType.DELTA_SINK + + @property + def sink_options(self) -> Dict[str, Any]: + """Returns the options for the sink configuration.""" + return self.sinkOptions diff --git a/src/dataflow/targets/sink_foreach_batch.py b/src/dataflow/targets/sink_foreach_batch.py new file mode 100644 index 0000000..b0c90a9 --- /dev/null +++ b/src/dataflow/targets/sink_foreach_batch.py @@ -0,0 +1,150 @@ +from dataclasses import dataclass, field +from typing import Dict, Any + +from pyspark import pipelines as dp + +import utility +from .base import BaseSink +from ..enums import SinkType +from ..sql import SqlMixin + + +@dataclass(frozen=True) +class ForEachBatchSinkType: + BASIC_SQL = "basic_sql" + PYTHON_FUNCTION = "python_function" + + +@dataclass(kw_only=True) +class TargetForEachBatchSink(BaseSink, SqlMixin): + """ + Target details structure for foreach batch sinks. + + Attributes: + name (str): Name of the sink. + type (str): Type of the sink. + config (Dict): Configuration for the sink. 
def _create_sink_basic_sql(self) -> None:
    """
    Create a foreach batch sink that transforms each micro batch with SQL and
    appends the result to a Delta location or table.

    Config keys read from ``self.config``:
        - partitionBy / clusterBy: optional layout of the written data
          (clusterBy takes precedence).
        - path: when set, the batch is appended to this path (after substitutions).
        - database / table: otherwise the batch is appended to this table.
        - tableProperties: optional properties applied with ALTER TABLE after
          writing to a table.

    Raises (inside the batch function):
        ValueError: If neither 'path' nor 'table' is configured.
    """
    @dp.foreach_batch_sink(
        name=self.sink_name
    )
    def batch_function(df, batch_id):
        spark = self.spark

        # Create a temporary view
        temp_view_name = f"micro_batch_view_{self.sink_name}"
        df.createOrReplaceTempView(temp_view_name)

        # Get SQL and replace the temporary view name
        sql = self.substitution_manager.substitute_string(self.rawSql)
        sql = sql.replace("micro_batch_view", temp_view_name)
        df_transformed = spark.sql(sql)

        partition_by = self.config.get("partitionBy", None)
        cluster_by = self.config.get("clusterBy", None)
        write_command = df_transformed.write.format("delta").mode("append")
        if cluster_by:
            write_command = write_command.clusterBy(cluster_by)
        elif partition_by:
            write_command = write_command.partitionBy(partition_by)

        # Write to Delta path
        path = self.config.get("path", None)
        if path:
            write_command.save(self.substitution_manager.substitute_string(path))
            return

        # Write to Delta table
        table = self.config.get("table", None)
        if not table:
            raise ValueError(
                f"Foreach batch sink '{self.sink_name}' config must specify either 'path' or 'table'"
            )
        database = self.config.get("database", None)
        table_name = f"{database}.{table}" if database else table

        # Append mode creates the table when it does not exist yet, so no
        # existence check (and no retry of a failed append) is needed.
        write_command.saveAsTable(table_name)

        # tableProperties is optional; only alter the table when some are given.
        table_properties = self.config.get("tableProperties", None) or {}
        if table_properties:
            table_properties_str = ", ".join(
                f"'{key}' = '{value}'" for key, value in table_properties.items()
            )
            spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ({table_properties_str})")
@dataclass(kw_only=True)
class TargetKafkaSink(BaseSink):
    """
    Target definition describing a Kafka sink.

    Attributes:
        name (str): Name of the sink.
        sinkOptions (Dict, optional): Options for the Kafka writer.
            Defaults to an empty dict.
    """
    name: str
    sinkOptions: Dict = field(default_factory=dict)

    def __post_init__(self):
        """Run the BaseSink initializer once the dataclass fields are set."""
        BaseSink.__init__(self)

    @property
    def sink_options(self) -> Dict[str, Any]:
        """Options for the sink configuration (the ``sinkOptions`` field)."""
        return self.sinkOptions

    @property
    def sink_type(self) -> str:
        """Type of the sink; always ``SinkType.KAFKA_SINK``."""
        return SinkType.KAFKA_SINK

    @property
    def sink_name(self) -> str:
        """Name of the sink (the ``name`` field)."""
        return self.name
@dataclass
class ViewConfig:
    """Optional configuration supplied when a view is created.

    Attributes:
        target_config_flags: The target config flags, or None when no flags
            are supplied.
    """
    target_config_flags: Optional[List[str]] = field(default=None)
def add_reader_options(self, reader_options: Dict):
    """Merge reader options into the view's source details.

    Args:
        reader_options: Options to add. When the source details have no
            "readerOptions" entry yet, this very dict object is stored;
            otherwise its items are merged into the existing entry,
            overriding keys that are already present.
    """
    details = self.sourceDetails
    if "readerOptions" not in details:
        # First set of options: store the caller's dict as-is.
        details["readerOptions"] = reader_options
        return
    details["readerOptions"].update(reader_options)
+1,10 @@ +from .dataflow_spec_builder import DataflowSpecBuilder +from .expectations_builder import DataQualityExpectationBuilder +from .spec_mapper import SpecMapper + + +__all__ = [ + 'DataflowSpecBuilder', + 'DataQualityExpectationBuilder', + 'SpecMapper' +] diff --git a/src/dataflow_spec_builder/dataflow_spec_builder.py b/src/dataflow_spec_builder/dataflow_spec_builder.py new file mode 100644 index 0000000..09b4bc1 --- /dev/null +++ b/src/dataflow_spec_builder/dataflow_spec_builder.py @@ -0,0 +1,698 @@ +import json +import os + +from typing import Dict, List, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed + +from constants import ( + FrameworkPaths, PipelineBundlePaths, SupportedSpecFormat, PipelineBundleSuffixesJson, PipelineBundleSuffixesYaml +) +from dataflow.dataflow_spec import DataflowSpec +import pipeline_config +from secrets_manager import SecretsManager +import utility + +from .expectations_builder import DataQualityExpectationBuilder +from .spec_mapper import SpecMapper +from .template_processor import TemplateProcessor +from .transformer import SpecTransformerFactory + + +class DataflowSpecBuilder: + """ + Dataflow specification builder. + + Attributes: + LOCALISE_PATHS (Dict): Paths to localize. + logger: Logger instance. + dataflow_path (str): Path to the dataflow. + framework_path (str): Path to the framework. + filters (Dict): Filters for the dataflow. + substitution_manager (SubstitutionManager): Substitution manager instance. + secrets_manager (SecretsManager): Secrets manager instance. + max_workers (int): Maximum number of worker threads for parallel file loading. + filter_list_data_flow_ids (List[str]): List of data flow IDs to filter. + filter_list_data_flow_groups (List[str]): List of data flow groups to filter. + filter_list_flow_group_ids (List[str]): List of flow group IDs to filter. + filter_list_target_tables (List[str]): List of target tables to filter. 
class Keys:
    """Dictionary key names used in the dataflow spec files and in the final dataflow spec format."""

    # --- Core identity of a dataflow specification ---
    DATA_FLOW_ID = "dataFlowId"
    DATA_FLOW_GROUP = "dataFlowGroup"
    DATA_FLOW_TYPE = "dataFlowType"
    DATA_FLOW_VERSION = "dataFlowVersion"

    # --- Target configuration ---
    TARGET_DETAILS = "targetDetails"
    TARGET_TABLE = "targetTable"
    TABLE = "table"

    # --- Wrapper key holding the raw spec payload ---
    DATA = "data"

    # --- Flow groups ---
    FLOW_GROUPS = "flowGroups"
    FLOW_GROUP_ID = "flowGroupId"

    # --- Data quality expectations (read from the dataflow spec) ---
    DATA_QUALITY_EXPECTATIONS_ENABLED = "dataQualityExpectationsEnabled"
    DATA_QUALITY_EXPECTATIONS_PATH = "dataQualityExpectationsPath"
    DATA_QUALITY_EXPECTATIONS = "dataQualityExpectations"

    # --- Paths ---
    LOCAL_PATH = "localPath"

    # --- Version mapping ---
    GLOBAL = "global"

    # --- Templates ---
    TEMPLATE = "template"
    DATAFLOW_SPEC_PARAMS = "parameterSets"
    PARAMS = "parameters"
self.ignore_validation_errors = ignore_validation_errors + self.global_dataflow_spec_version = dataflow_spec_version + self.max_workers = max_workers + self.spec_file_format = spec_file_format.lower() + + valid_formats = [fmt.value for fmt in SupportedSpecFormat] + if self.spec_file_format not in valid_formats: + raise ValueError(f"Invalid enabled format: {self.spec_file_format}. Valid formats are: {valid_formats}") + + self.dataflow_path = os.path.join(bundle_path, PipelineBundlePaths.DATAFLOWS_BASE_PATH) + + self.logger = pipeline_config.get_logger() + self.substitution_manager = pipeline_config.get_substitution_manager() + + # Initialize spec mapper for version migrations + self.spec_mapper = SpecMapper(self.framework_path, self.max_workers) + + # Initialize template processor + self.template_processor = TemplateProcessor(self.bundle_path, self.framework_path) + + # Initialize filters + self.filter_data_flow_ids = self._parse_filter(self.filters.get("data_flow_ids")) + self.filter_data_flow_groups = self._parse_filter(self.filters.get("data_flow_groups")) + self.filter_flow_group_ids = self._parse_filter(self.filters.get("flow_group_ids")) + self.filter_target_tables = self._parse_filter(self.filters.get("target_tables")) + self.filter_files = self._parse_filter(self.filters.get("files")) + + # Initialize validators + self.main_validator = utility.JSONValidator( + os.path.join(self.framework_path,FrameworkPaths.MAIN_SPEC_SCHEMA_PATH)) + self.flow_validator = utility.JSONValidator( + os.path.join(self.framework_path,FrameworkPaths.FLOW_GROUP_SPEC_SCHEMA_PATH)) + + # Initialize storage + self.processed_specs: List[DataflowSpec] = [] + self.validation_errors: Dict[str, str] = {} + + def _post_init(self) -> None: + """Post-initialization setup.""" + if not os.path.exists(self.bundle_path): + raise FileNotFoundError(f"Path does not exist in Workspace files, or is inaccessible: {self.bundle_path}") + if not os.path.exists(self.dataflow_path): + raise 
FileNotFoundError(f"Path does not exist in Workspace files, or is inaccessible: {self.dataflow_path}") + + @staticmethod + def _parse_filter(filter_str: Optional[str]) -> List[str]: + """Parse a filter string into a list of lowercase, stripped values.""" + return [item.lower().strip() for item in filter_str.split(",")] if filter_str else [] + + def build(self) -> List[DataflowSpec]: + """ + Build DataflowSpec instances from configuration files. + + Returns: + List[DataflowSpec]: List of processed dataflow specifications + + Raises: + ValueError: If any validation errors are encountered + """ + self.logger.info( + "Dataflow Spec Builder - Commencing Build, Settings:\n" + f" Dataflow Path: {self.dataflow_path}\n" + f" Max Workers: {self.max_workers}\n" + f" Filters: {json.dumps(self.filters, indent=8)}\n" + f" Ignore Validation Errors: {self.ignore_validation_errors}\n" + f" Global Dataflow Spec Version (optional): {self.global_dataflow_spec_version}" + ) + + self.logger.info("Reading and filtering dataflow specs...") + main_specs, flow_specs = self._read_dataflow_specs() + main_specs = self._filter_dataflow_specs(main_specs) + + # Apply dataflow spec version mapping + self.logger.info("Applying dataflow spec version mapping...") + main_specs = self._apply_dataflow_spec_mapping(main_specs) + flow_specs = self._apply_dataflow_spec_mapping(flow_specs) + + # Validate dataflow specs + self.logger.info("Validating dataflow specs...") + self._validate_dataflow_specs({**main_specs, **flow_specs}) + + # Merge flow groups + self.logger.info("Merging flow groups...") + spec_payloads = self._merge_flow_groups(main_specs, flow_specs) + + # Transform dataflow specs + self.logger.info("Transforming dataflow specs...") + spec_payloads = self._transform_specs(spec_payloads) + + # Final processing of dataflow specs + self.logger.info("Final processing of dataflow specs...") + self._process_specs(spec_payloads) + + return self.processed_specs + + def _read_dataflow_specs(self) -> 
Dict: + """Read dataflow specifications based on filters.""" + main_specs = {} + flow_specs = {} + + def _extract_spec(spec: Dict) -> Dict: + """Extract metadata from a dataflow specification.""" + return { + "fileType": "main", + self.Keys.DATA_FLOW_ID: spec.get(self.Keys.DATA_FLOW_ID, None), + self.Keys.DATA_FLOW_GROUP: spec.get(self.Keys.DATA_FLOW_GROUP, None), + self.Keys.DATA_FLOW_TYPE: spec.get(self.Keys.DATA_FLOW_TYPE, None), + self.Keys.TARGET_TABLE: spec.get(self.Keys.TARGET_DETAILS, {}).get(self.Keys.TABLE, None), + self.Keys.DATA: spec + } + + def _validate_missing_metadata(spec: Dict) -> Dict: + """Validate a dataflow specification for missing metadata.""" + missing_metadata = [] + if not spec.get(self.Keys.DATA_FLOW_ID): + missing_metadata.append(self.Keys.DATA_FLOW_ID) + if not spec.get(self.Keys.DATA_FLOW_GROUP): + missing_metadata.append(self.Keys.DATA_FLOW_GROUP) + if not spec.get(self.Keys.DATA_FLOW_TYPE): + missing_metadata.append(self.Keys.DATA_FLOW_TYPE) + return missing_metadata + + if self.filter_files: + self.logger.info(f"Loading dataflow specifications by file filters: {self.filters}") + # Convert filter paths to full file paths and load data + for filter_path in self.filter_files: + + full_path = os.path.join(self.dataflow_path, filter_path) + if not self._validate_file_path(full_path): + continue + + data = utility.load_config_file_auto(full_path, fail_on_not_exists=True) + if data: + main_specs[full_path] = _extract_spec(data) + + else: + suffixes = utility.get_format_suffixes(self.spec_file_format, "main_spec") + suffixes.extend(utility.get_format_suffixes(self.spec_file_format, "flow_group")) + + self.logger.info(f"Loading {self.spec_file_format.upper()} dataflow specifications recursively...") + categorized_files = utility.get_data_from_files_parallel( + path=self.dataflow_path, + file_format=self.spec_file_format, + file_suffix=suffixes, + recursive=True, + max_workers=self.max_workers + ) + + main_files_data = {} + flow_files_data 
= {} + for key, value in categorized_files.items(): + # Normalize key to always be iterable + keys_to_check = key if isinstance(key, tuple) else [key] + if any("_main." in suffix for suffix in keys_to_check): + main_files_data.update(value) + elif any("_flow." in suffix for suffix in keys_to_check): + flow_files_data.update(value) + + if not main_files_data: + valid_formats = [fmt.value for fmt in SupportedSpecFormat] + raise ValueError( + f"No dataflow specification files found in: {self.dataflow_path}. " + f"Spec format set to: {self.spec_file_format.upper()}. " + f"Valid formats are: {valid_formats}" + ) + else: + self.logger.info(f"Found {len(main_files_data)} main spec files.") + + # Process all specs in one pass: expand templates, extract regular specs + main_specs = {} + for file_path, json_data in main_files_data.items(): + if self.Keys.TEMPLATE in json_data: + # Template: expand and extract each resulting spec + expanded_specs = self.template_processor.process_template_spec( + file_path, + json_data, + self.spec_file_format + ) + for expanded_path, expanded_json in expanded_specs.items(): + main_specs[expanded_path] = _extract_spec(expanded_json) + else: + main_specs[file_path] = _extract_spec(json_data) + + # Load and process flow group files if any exist + if flow_files_data: + flow_specs = { + file_path: { + "fileType": "flow_group", + self.Keys.DATA_FLOW_ID: json_data.get(self.Keys.DATA_FLOW_ID), + self.Keys.DATA: json_data + } + for file_path, json_data in flow_files_data.items() + } + + metadata_validation_errors = {} + for spec_path, spec_data in main_specs.items(): + missing_metadata = _validate_missing_metadata(spec_data) + if missing_metadata: + metadata_validation_errors[spec_path] = f"Missing metadata: {missing_metadata}" + + if not self.ignore_validation_errors and metadata_validation_errors: + raise ValueError(f"Dataflow Spec Metadata Validation Errors:\n{json.dumps(metadata_validation_errors, indent=2)}") + + self.logger.info(f"Loaded 
{len(main_specs)} dataflow specifications.") + + return main_specs, flow_specs + + def _merge_flow_groups(self, specs: Dict, flow_groups: Dict) -> Dict: + """Merge flow groups into the dataflow specifications.""" + for spec_path, spec_data in specs.items(): + dataflow_type = spec_data.get(self.Keys.DATA_FLOW_TYPE) + dataflow_id = spec_data.get(self.Keys.DATA_FLOW_ID) + if not dataflow_id or not dataflow_type: + continue + + if dataflow_type == "flow" and dataflow_id in flow_groups: + # Get existing flow groups from main spec + existing_flow_groups = spec_data.get(self.Keys.FLOW_GROUPS, []) + new_flow_groups = flow_groups[dataflow_id] + + # Check for duplicate flow group IDs + existing_ids = {group.get(self.Keys.FLOW_GROUP_ID) for group in existing_flow_groups if group.get(self.Keys.FLOW_GROUP_ID)} + new_ids = {group.get(self.Keys.FLOW_GROUP_ID) for group in new_flow_groups if group.get(self.Keys.FLOW_GROUP_ID)} + duplicate_ids = existing_ids & new_ids + + if duplicate_ids: + message = ( + f"Duplicate flow group IDs found for dataflow '{dataflow_id}': {duplicate_ids}. " + f"Flow group IDs must be unique across main spec and flow group files." + ) + self.logger.error(message) + self.validation_errors[spec_path] = message + + if not self.ignore_validation_errors: + raise ValueError(message) + + # Merge flow groups + self.logger.info(f"Merging flow group files for dataflow: '{dataflow_id}'. 
Flow Group IDs: {new_ids}") + if not existing_flow_groups: + spec_data[self.Keys.FLOW_GROUPS] = new_flow_groups + else: + spec_data[self.Keys.FLOW_GROUPS].extend(new_flow_groups) + + return specs + + def _filter_dataflow_specs(self, specs: Dict) -> Dict: + """Apply filters to a dataflow specification.""" + filtered_specs = {} + filtered_out_notifications = [] + kept_notifications = [] + for spec_path, spec_payload in specs.items(): + + # Filter dataflow specs on the main filters + if self._matches_filters(spec_payload): + filtered_specs[spec_path] = spec_payload + kept_notifications.append( + f"ID: {spec_payload.get(self.Keys.DATA_FLOW_ID)}, " + f"Group: {spec_payload.get(self.Keys.DATA_FLOW_GROUP)}, " + f"Path: {spec_path}" + ) + # Filter flow groups if the dataflow type is flow + if self.filter_flow_group_ids and spec_payload.get(self.Keys.DATA_FLOW_TYPE) == "flow": + spec_payload[self.Keys.FLOW_GROUPS] = [ + group for group in spec_payload.get(self.Keys.FLOW_GROUPS, []) + if group[self.Keys.FLOW_GROUP_ID].strip().lower() in self.filter_flow_group_ids + ] + else: + filtered_out_notifications.append( + f"ID: {spec_payload.get(self.Keys.DATA_FLOW_ID)}, " + f"Group: {spec_payload.get(self.Keys.DATA_FLOW_GROUP)}, " + f"Path: {spec_path}" + ) + + if kept_notifications: + self.logger.info("The following dataflow specs were kept:\n" + '\n'.join(kept_notifications) + "\n") + + if filtered_out_notifications: + self.logger.info( + "The following dataflow specs were filtered out:\n" + '\n'.join(filtered_out_notifications) + "\n" + ) + + return filtered_specs + + def _matches_filters(self, spec_payload: Dict) -> bool: + """Check if a specification matches the current filters.""" + data_flow_id = spec_payload.get(self.Keys.DATA_FLOW_ID).lower() + data_flow_group = spec_payload.get(self.Keys.DATA_FLOW_GROUP).lower() + spec_has_target_table = spec_payload.get(self.Keys.TARGET_DETAILS, {}).get(self.Keys.TABLE, None) is not None + target_table = 
spec_payload.get(self.Keys.TARGET_DETAILS, {}).get(self.Keys.TABLE, "").lower() + + if self.filter_data_flow_ids and data_flow_id not in self.filter_data_flow_ids: + return False + if self.filter_data_flow_groups and data_flow_group not in self.filter_data_flow_groups: + return False + if self.filter_target_tables and spec_has_target_table and target_table not in self.filter_target_tables: + return False + + return True + + def _apply_dataflow_spec_mapping(self, specs: Dict) -> Dict: + """Apply the dataflow spec mapping to the dataflow specs using parallel processing.""" + return self.spec_mapper.apply_mappings( + specs, + global_version=self.global_dataflow_spec_version, + ignore_errors=self.ignore_validation_errors + ) + + def _validate_dataflow_specs(self, spec_payloads: Dict) -> None: + """Validate the dataflow specifications using parallel processing.""" + if not spec_payloads: + return + + def _validate_json(spec_item: tuple) -> tuple: + """Validate JSON data against a schema.""" + spec_path, spec_payload = spec_item + file_type = spec_payload.get("fileType", "main") + json_data = spec_payload.get(self.Keys.DATA) + if file_type == "main": + errors = self.main_validator.validate(json_data) + else: + errors = self.flow_validator.validate(json_data) + return spec_path, errors + + validation_errors = {} + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit all validation tasks + future_to_path = { + executor.submit(_validate_json, (spec_path, spec_payload)): spec_path + for spec_path, spec_payload in spec_payloads.items() + } + + # Collect results + for future in as_completed(future_to_path): + spec_path, errors = future.result() + if errors: + validation_errors[spec_path] = errors + + self.validation_errors.update(validation_errors) + if validation_errors: + self.logger.warning(f"Invalid dataflow spec files found:\n{validation_errors}") + if not self.ignore_validation_errors: + raise ValueError(f"Invalid dataflow spec files 
found:\n{validation_errors}") + + def _transform_specs(self, spec_payloads: Dict) -> Dict: + """Transform the dataflow specs using parallel processing.""" + if not spec_payloads: + return spec_payloads + + def _transform_spec_worker(spec_item: tuple) -> tuple: + """Worker function to transform a single spec.""" + spec_path, spec_payload = spec_item + + try: + dataflow_type = spec_payload.get(self.Keys.DATA_FLOW_TYPE, "").strip().lower() + spec_data = spec_payload.get(self.Keys.DATA) + transformer = SpecTransformerFactory.create_transformer(dataflow_type) + transformed_spec = transformer.transform(spec_data) + spec_payload[self.Keys.DATA] = transformed_spec + return spec_path, spec_payload, None + + except Exception as e: + return spec_path, spec_payload, str(e) + + results = {} + errors = {} + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit all transformation tasks + future_to_path = { + executor.submit(_transform_spec_worker, (spec_path, spec_payload)): spec_path + for spec_path, spec_payload in spec_payloads.items() + } + + # Collect results + for future in as_completed(future_to_path): + spec_path, spec_payload, error = future.result() + if error: + errors[spec_path] = error + self.logger.warning(f"Failed to transform spec {spec_path}: {error}") + results[spec_path] = spec_payload + + if errors and not self.ignore_validation_errors: + self.logger.warning(f"Some specs failed during transformation: {errors}") + + return results + + def _process_specs(self, spec_payloads: Dict) -> None: + """Add a processed dataflow specification.""" + for spec_path, spec_payload in spec_payloads.items(): + base_path = self._get_base_path(spec_path) + spec_data = spec_payload.get(self.Keys.DATA) + if isinstance(spec_data, list): + for spec in spec_data: + self._process_spec_data(base_path, spec) + else: + self._process_spec_data(base_path, spec_data) + + def _process_spec_data(self, base_path: str, spec_data: Dict) -> None: + """Process the dataflow 
def _localize_paths_recursive(self, obj, base_path: str, root_spec_data: Dict):
    """Walk the spec structure and resolve path values in place.

    Lists are traversed element by element. In dicts, every string value
    whose key is listed in ``DataflowSpecBuilder.LOCALISE_PATHS`` is replaced
    by its resolved path, and nested dicts / lists are traversed in turn.
    Any other object is left untouched.

    Args:
        obj: Current node of the spec structure.
        base_path: Base path used to resolve relative paths.
        root_spec_data: The top-level spec, passed through to path resolution.
    """
    if isinstance(obj, list):
        for element in obj:
            self._localize_paths_recursive(element, base_path, root_spec_data)
        return

    if not isinstance(obj, dict):
        return

    for name, entry in obj.items():
        if isinstance(entry, str):
            # Only known path keys (e.g. "schemaPath") are rewritten.
            if name in DataflowSpecBuilder.LOCALISE_PATHS:
                obj[name] = self._resolve_path_value(name, entry, base_path, root_spec_data)
        elif isinstance(entry, (dict, list)):
            self._localize_paths_recursive(entry, base_path, root_spec_data)
Always returns normalized path.""" + if self._is_valid_absolute_path(value): + return value + + if not value: + return value + + # For Python function paths, use enhanced resolution with fallbacks + if key == "functionPath": + try: + return self._resolve_python_function_path(value, base_path, root_spec_data) + except FileNotFoundError: + self.logger.error( + f"Python function '{value}' not found in any search location, " + f"keeping original path" + ) + #TODO: pipe through to validation errors + return os.path.normpath(value) + else: + # Standard path resolution + subpath = DataflowSpecBuilder.LOCALISE_PATHS[key] + resolved_path = os.path.normpath( + os.path.join(base_path, PipelineBundlePaths.DATAFLOW_SPEC_PATH, subpath, value) + ) + if os.path.exists(resolved_path): + return resolved_path + + resolved_path = os.path.normpath(os.path.join(base_path, subpath, value)) + if os.path.exists(resolved_path): + return resolved_path + + return resolved_path + + def _is_valid_absolute_path(self, path: str) -> bool: + """Check if path is truly absolute and within our known base paths.""" + if not os.path.isabs(path): + return False + + # Normalize paths for comparison (resolve symlinks, remove trailing slashes) + normalized_path = os.path.normpath(os.path.abspath(path)) + normalized_bundle = os.path.normpath(os.path.abspath(self.bundle_path)) + normalized_framework = os.path.normpath(os.path.abspath(self.framework_path)) + + # Check if path starts with bundle_path or framework_path + if normalized_path.startswith(normalized_bundle): + self.logger.debug(f"Path '{path}' is absolute within bundle path") + return True + + if normalized_path.startswith(normalized_framework): + self.logger.debug(f"Path '{path}' is absolute within framework path") + return True + + # Path is absolute but not within our known locations - log warning but accept it + # This allows for legitimate absolute paths outside our bundle/framework + self.logger.warning( + f"Path '{path}' is absolute but not 
within bundle or framework paths. " + f"Using as-is, but this may cause issues if the path is not valid." + ) + return True + + def _resolve_python_function_path(self, filename: str, base_path: str, spec_data: Dict) -> str: + """ + Resolve Python function path with context-aware fallback logic. + + Search order for regular specs: + 1. base_path/python_functions/ + 2. bundle_path/extensions/python_functions/ + 3. framework_path/extensions/python_functions/ + + Search order for template-generated specs (adds one additional location): + 1. base_path/python_functions/ + 2. bundle_path/templates/python_functions/ + 3. bundle_path/extensions/python_functions/ + 4. framework_path/extensions/python_functions/ + """ + search_paths = { + "base dataflow directory": + os.path.join(base_path, PipelineBundlePaths.PYTHON_FUNCTION_PATH, filename), + "templates directory": + os.path.join(self.bundle_path, PipelineBundlePaths.TEMPLATE_PATH, + PipelineBundlePaths.PYTHON_FUNCTION_PATH, filename), + "bundle extensions directory": + os.path.join(self.bundle_path, PipelineBundlePaths.EXTENSIONS_PATH, filename), + "framework extensions directory": + os.path.join(self.framework_path, FrameworkPaths.EXTENSIONS_PATH, filename), + } + + # Template-specific location (only for template-generated specs) + is_template_generated = spec_data.get("tags", {}).get("_isTemplateGenerated", False) + if not is_template_generated: + search_paths.pop("templates directory") + + # Try each path in order + for key, path in search_paths.items(): + normalized_path = os.path.normpath(path) + if os.path.exists(normalized_path): + self.logger.debug(f"Resolved Python function '{filename}' from {key}: {normalized_path}") + return normalized_path + + spec_path = f"{base_path}/{spec_data.get(self.Keys.DATA_FLOW_ID)}" + self.validation_errors[spec_path] = f"Python function file '{filename}' not found in any search location" + + return filename + + def _validate_file_path(self, path: str) -> bool: + """Validate a file 
def _get_base_path(self, file_path: str) -> str:
    """Derive the base path for files associated with a spec file.

    Takes the directory of ``file_path``, removes a trailing
    ``PipelineBundlePaths.DATAFLOW_SPEC_PATH`` segment when present, then
    removes a single trailing '/' when present.

    Args:
        file_path: Path of the dataflow spec file.

    Returns:
        The base path; it is also logged at debug level.
    """
    spec_dir = PipelineBundlePaths.DATAFLOW_SPEC_PATH
    parent_dir = os.path.dirname(file_path)

    trimmed = parent_dir[:-len(spec_dir)] if parent_dir.endswith(spec_dir) else parent_dir
    base_path = trimmed[:-1] if trimmed.endswith('/') else trimmed

    self.logger.debug(f"Base path for associated files: {base_path}")
    return base_path
class DataQualityExpectationBuilder:
    """
    Build data quality expectations from JSON/YAML files using a validation schema.

    Attributes:
        logger (logging.Logger): Logger instance for logging messages.
        validator (JSONValidator): Validator built from the expectations JSON schema.
        spec_file_format (str): Specification file format (json or yaml), lower-cased.

    Methods:
        get_expectation_rules(expectations: Dict, expectation_type: str, tag: str = None) -> Dict:
            Retrieves expectation rules of a specific type from expectations.
        get_expectations(path: str) -> DataQualityExpectations:
            Retrieves data quality expectations from JSON/YAML files.
    """

    def __init__(
        self,
        logger: logging.Logger,
        json_validation_schema_path: str,
        spec_file_format: str = "json"
    ):
        """Initialize the DataQualityExpectationBuilder.

        Args:
            logger: Logger instance for logging messages.
            json_validation_schema_path: Path to the JSON validation schema.
            spec_file_format: Specification file format (json or yaml). Defaults to "json".

        Raises:
            ValueError: If spec_file_format is not a valid format.
        """
        self.logger = logger
        self.validator = utility.JSONValidator(json_validation_schema_path)
        self.spec_file_format = spec_file_format.lower()

        valid_formats = [fmt.value for fmt in SupportedSpecFormat]
        if self.spec_file_format not in valid_formats:
            raise ValueError(
                f"Invalid spec file format: '{self.spec_file_format}'. "
                f"Valid formats are: {valid_formats}"
            )

    def get_expectation_rules(self, expectations: Dict, expectation_type: str, tag: str = None) -> Dict:
        """Retrieve expectation rules of a specific type from expectations.

        Args:
            expectations: Dictionary of expectations data.
            expectation_type: Type of expectation to retrieve (EXPECT, EXPECT_OR_DROP, EXPECT_OR_FAIL).
            tag: Optional tag to filter expectations. Expectations without a
                "tag" entry never match a tag filter.

        Returns:
            Dictionary of expectation rules mapped by name to constraint, or
            None when ``expectation_type`` is not present in ``expectations``.
        """
        if expectation_type not in expectations:
            return None

        rules = {}
        for expectation in expectations[expectation_type]:
            # Expectations are enabled unless they explicitly say otherwise.
            if not bool(expectation.get('enabled', True)):
                continue
            # .get() so an untagged expectation is filtered out rather than
            # raising KeyError when a tag filter is supplied.
            if tag and expectation.get("tag") != tag:
                continue
            rules[expectation["name"]] = expectation["constraint"]
        return rules

    def _load_single_file(self, file_path: str) -> Dict:
        """Load and validate a single expectations file.

        Raises:
            ValueError: If the loader returns nothing for the file, or if the
                content fails schema validation.
        """
        self.logger.info("Loading expectations from file: %s", file_path)

        file_data = utility.load_config_file_auto(file_path, fail_on_not_exists=False)

        if not file_data:
            raise ValueError(f"Expectations file not found: {file_path}")

        # Validate file data
        errors = self.validator.validate(file_data)
        if errors:
            raise ValueError(f"Invalid expectations file '{file_path}': {errors}")

        return file_data

    @staticmethod
    def _merge_expectations(merged: Dict, data: Dict) -> Dict:
        """Merge one file's expectations into ``merged`` (in place) and return it.

        List values found under the same key are concatenated, so rules of the
        same expectation type spread over several files are all kept. Any other
        value overwrites the existing entry.
        """
        for key, value in data.items():
            existing = merged.get(key)
            if isinstance(existing, list) and isinstance(value, list):
                existing.extend(value)
            elif isinstance(value, list):
                # Copy so later extends never mutate the loaded file data.
                merged[key] = list(value)
            else:
                merged[key] = value
        return merged

    def _load_directory(self, directory_path: str) -> Dict:
        """Load, validate and merge all expectations files from a directory.

        Returns:
            The merged expectations, or an empty dict when the directory holds
            no expectations files.

        Raises:
            ValueError: If any file fails schema validation.
        """
        self.logger.info("Loading expectations from directory: %s", directory_path)

        # Get appropriate file suffix based on spec format
        file_suffix = utility.get_format_suffixes(self.spec_file_format, "expectations")

        # Load all expectations files from directory
        file_data = utility.load_config_files(
            path=directory_path,
            file_format=self.spec_file_format,
            file_suffix=file_suffix,
            recursive=False
        )

        if not file_data:
            self.logger.warning("No expectations files found in directory: %s", directory_path)
            return {}

        # Validate all files and merge data
        validation_errors = {}
        merged_expectations = {}

        for file_path, data in file_data.items():
            errors = self.validator.validate(data)
            if errors:
                validation_errors[file_path] = errors
            else:
                self._merge_expectations(merged_expectations, data)

        if validation_errors:
            raise ValueError(f"Invalid expectations files found: {validation_errors}")

        return merged_expectations

    def get_expectations(self, path: str) -> "DataQualityExpectations":
        """Retrieve data quality expectations from a JSON/YAML file or directory.

        Args:
            path: Path to a single expectations file or to a directory of them.

        Raises:
            ValueError: If ``path`` is blank, does not exist, or the loaded
                content is invalid.
        """
        if not path or path.strip() == "":
            raise ValueError("Expectations file path is not set")

        self.logger.info("Getting expectations from: %s", path)

        # Determine if path is a file or directory
        if os.path.isfile(path):
            expectations = self._load_single_file(path)
        elif os.path.isdir(path):
            expectations = self._load_directory(path)
        else:
            raise ValueError(f"Path does not exist: {path}")

        self.logger.debug("Loaded expectations: %s", expectations)

        return DataQualityExpectations(
            expectationsJson=expectations,
            expectRules=self.get_expectation_rules(expectations, ExpectationType.EXPECT),
            expectOrDropRules=self.get_expectation_rules(expectations, ExpectationType.EXPECT_OR_DROP),
            expectOrFailRules=self.get_expectation_rules(expectations, ExpectationType.EXPECT_OR_FAIL)
        )
class SpecMapper:
    """
    Handle dataflow spec version migrations and key mappings.

    Supports operations:
    - move: Move a key to a new location
    - rename_all: Rename keys recursively throughout the spec
    - rename_specific: Rename specific keys at exact paths
    - delete: Delete keys from the spec

    Each operation supports conditional execution based on spec properties.
    """

    class Keys:
        """Constants for spec mapping keys."""
        DATA = "data"
        DATA_FLOW_ID = "dataFlowId"
        DATA_FLOW_TYPE = "dataFlowType"
        DATA_FLOW_VERSION = "dataFlowVersion"
        GLOBAL = "global"

    class Operators:
        """Constants for condition operators."""
        EQUAL_TO = "equal_to"
        NOT_EQUAL_TO = "not_equal_to"
        IN = "in"
        NOT_IN = "not_in"

    def __init__(self, framework_path: str, max_workers: int = 1):
        """
        Initialize the SpecMapper.

        Args:
            framework_path: Path to the framework directory
            max_workers: Maximum parallel workers for processing
        """
        self.framework_path = framework_path
        self.max_workers = max_workers
        # Validated mapping configurations keyed by version string.
        self._mapping_cache: Dict[str, Dict] = {}

        self.logger = pipeline_config.get_logger()
        self.validator = utility.JSONValidator(
            os.path.join(self.framework_path, FrameworkPaths.SPEC_MAPPING_SCHEMA_PATH)
        )

    def apply_mappings(
        self,
        specs: Dict,
        global_version: Optional[str] = None,
        ignore_errors: bool = False
    ) -> Dict:
        """
        Apply version mappings to specs using a thread pool.

        Args:
            specs: Dictionary of spec_path -> spec_payload
            global_version: Global mapping version to apply
            ignore_errors: When False, a summary warning listing every failed
                spec is logged after processing. No exception is raised in
                either case; failed specs are still included in the result.

        Returns:
            Dictionary of spec_path -> processed payload, in the same order as
            ``specs``. ``specs`` itself is returned when it is empty or None.
        """
        self.logger.info(
            "Spec Mapper - Applying Mappings:\n"
            f" Max Workers: {self.max_workers}\n"
            f" Global Dataflow Spec Version (optional): {global_version}\n"
            f" Spec Count: {len(specs) if specs else 0}"
        )

        if not specs:
            return specs

        # Process specs in parallel
        results = {}
        errors = {}

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_path = {
                executor.submit(
                    self._apply_mapping_to_spec,
                    spec_path,
                    spec_payload,
                    global_version
                ): spec_path
                for spec_path, spec_payload in specs.items()
            }

            for future in as_completed(future_to_path):
                spec_path, spec_payload, error = future.result()
                if error:
                    errors[spec_path] = error
                    self.logger.warning(f"Failed to apply mapping for {spec_path}: {error}")
                results[spec_path] = spec_payload

        if errors and not ignore_errors:
            self.logger.warning(f"Some specs failed during mapping: {errors}")

        # as_completed yields in completion order; rebuild in input order so
        # the returned dict is deterministic.
        return {spec_path: results[spec_path] for spec_path in specs if spec_path in results}

    def get_mapping(self, version: str) -> Dict:
        """
        Load, validate, and cache mapping configuration for a version.

        Args:
            version: Version string (e.g., "0.2.0")

        Returns:
            Mapping configuration dictionary

        Raises:
            FileNotFoundError: If the mapping cannot be loaded or fails schema
                validation (every failure is re-raised as this type).
        """
        if version in self._mapping_cache:
            return self._mapping_cache[version]

        mapping_path = os.path.join(
            self.framework_path,
            FrameworkPaths.DATAFLOW_SPEC_MAPPING_PATH,
            version,
            "dataflow_spec_mapping.json"
        )

        try:
            mapping = utility.get_json_from_file(mapping_path, True)

            errors = self.validator.validate(mapping)
            if errors:
                raise ValueError(f"Spec mapping validation failed for {mapping_path}: {errors}")

            self._mapping_cache[version] = mapping
            return mapping
        except Exception as e:
            msg = f"Error loading spec mapping version {version}: {str(e)}"
            self.logger.error(msg)
            raise FileNotFoundError(msg) from e

    def _apply_mapping_to_spec(
        self,
        spec_path: str,
        spec_payload: Dict,
        global_version: Optional[str]
    ) -> Tuple[str, Dict, Optional[str]]:
        """
        Apply mapping to a single spec.

        A ``dataFlowVersion`` found in the spec data overrides ``global_version``.
        Exceptions are never propagated; they are returned as the error message.

        Returns:
            Tuple of (spec_path, updated_payload, error_message)
        """
        try:
            spec_id = spec_payload.get(self.Keys.DATA_FLOW_ID, "").strip().lower()
            spec_type = spec_payload.get(self.Keys.DATA_FLOW_TYPE, "").strip().lower()
            spec_data = spec_payload.get(self.Keys.DATA)

            if not isinstance(spec_data, dict):
                # Report the real problem instead of an AttributeError on None.
                return (
                    spec_path,
                    spec_payload,
                    f"Spec payload has no '{self.Keys.DATA}' dictionary"
                )

            # Get mapping configuration
            mapping = {}
            if global_version:
                mapping = self.get_mapping(global_version)

            # Check for spec-specific version override
            spec_version = spec_data.get(self.Keys.DATA_FLOW_VERSION)
            if spec_version:
                mapping = self.get_mapping(spec_version)
                self.logger.info(
                    f"Using spec-specific mapping version {spec_version} for {spec_id}"
                )

            if not mapping:
                return spec_path, spec_payload, None

            # Merge global and spec-type specific mappings; spec-type entries
            # replace global entries that share the same operation name.
            global_mappings = mapping.get(self.Keys.GLOBAL, {})
            spec_type_mappings = mapping.get(spec_type, {})
            key_mappings = {**global_mappings, **spec_type_mappings}

            if key_mappings:
                spec_data = self._apply_operations(spec_data, key_mappings, spec_path)
                spec_payload[self.Keys.DATA] = spec_data

            return spec_path, spec_payload, None

        except Exception as e:
            return spec_path, spec_payload, str(e)

    def _apply_operations(
        self,
        spec_data: Dict,
        mappings: Dict,
        spec_path: str
    ) -> Dict:
        """
        Apply all mapping operations to spec data.

        Operations are applied in order:
        1. move - Move values to new locations (the source key is removed as
           part of the move)
        2. rename_specific - Rename specific key paths
        3. rename_all - Recursively rename keys
        4. delete - Remove keys

        A delete entry may be ``true`` (delete), ``false`` (keep), or a dict
        with a ``condition`` (delete only when the condition holds).
        """
        rename_all_ops = mappings.get("rename_all", {})
        rename_specific_ops = mappings.get("rename_specific", {})
        move_ops = mappings.get("move", {})
        delete_ops = mappings.get("delete", {})

        self.logger.debug(f"Applying mapping to spec: {spec_path}")

        # 1. Move operations (recursive)
        if move_ops:
            self.logger.debug(f"Applying move operations: {move_ops}")
            for src, dest_config in move_ops.items():
                # Parse the destination and condition from config
                if isinstance(dest_config, str):
                    dest = dest_config
                    condition = None
                elif isinstance(dest_config, dict):
                    dest = dest_config.get("to")
                    condition = dest_config.get("condition")
                else:
                    continue

                if dest:
                    # Apply move recursively throughout the spec
                    spec_data = self._move_key_recursive(spec_data, src, dest, condition)

        # 2. Rename specific operations
        if rename_specific_ops:
            self.logger.debug(f"Applying rename specific operations: {rename_specific_ops}")
            for src, dest_config in rename_specific_ops.items():
                dest, should_apply = self._parse_conditional_operation(dest_config, spec_data)
                if should_apply and dest:
                    self._rename_key_specific(spec_data, src, dest)
                elif not should_apply:
                    self.logger.debug(f"Skipping rename '{src}' - condition not met")

        # 3. Rename all operations (recursive)
        if rename_all_ops:
            self.logger.debug(f"Applying rename all operations: {rename_all_ops}")
            filtered_rename_all = {}
            for src, dest_config in rename_all_ops.items():
                dest, should_apply = self._parse_conditional_operation(dest_config, spec_data)
                if should_apply and dest:
                    filtered_rename_all[src] = dest
            if filtered_rename_all:
                spec_data = self._rename_keys_recursive(spec_data, filtered_rename_all)

        # 4. Delete explicit keys
        if delete_ops:
            self.logger.debug(f"Applying delete operations: {delete_ops}")
            for key, delete_config in delete_ops.items():
                if delete_config is False:
                    # An explicit false means "do not delete".
                    continue
                if isinstance(delete_config, dict):
                    _, should_apply = self._parse_conditional_operation(delete_config, spec_data)
                    if not should_apply:
                        continue
                self._delete_key(spec_data, key)

        self.logger.info(f"Mapping applied to spec: {spec_path}")
        self.logger.info(f"Mapped spec: {spec_data}")
        return spec_data

    # Condition Evaluation
    def _parse_conditional_operation(
        self,
        operation_value: Any,
        data: Dict
    ) -> Tuple[Optional[str], bool]:
        """
        Parse an operation value that may be simple or conditional.

        Simple format: "targetDetails.newKey"
        Conditional format: {"to": "...", "condition": {...}}

        Returns:
            Tuple of (target_value, should_apply). Any other value type logs a
            warning and yields (None, False).
        """
        if isinstance(operation_value, str):
            return operation_value, True
        if isinstance(operation_value, dict):
            target = operation_value.get("to")
            condition = operation_value.get("condition")
            should_apply = self._evaluate_condition(data, condition)
            return target, should_apply

        self.logger.warning(f"Invalid operation value format: {operation_value}")
        return None, False

    def _evaluate_condition(self, data: Dict, condition: Optional[Dict]) -> bool:
        """
        Evaluate a condition against spec data.

        Condition format:
        {
            "key": "sourceType",           # dot notation path
            "operator": "not_equal_to",    # equal_to, not_equal_to, in, not_in
            "value": "python"              # comparison value
        }

        A missing/empty condition evaluates to True. The operator defaults to
        ``equal_to``. A key path that cannot be resolved compares as None.

        Raises:
            ValueError: For an unknown operator, or when ``in``/``not_in`` is
                used with a value that is not a list.
        """
        if not condition:
            return True

        key_path = condition.get("key", "")
        operator = condition.get("operator", self.Operators.EQUAL_TO)
        expected_value = condition.get("value")

        actual_value = self._get_nested_value(data, key_path)

        if operator == self.Operators.EQUAL_TO:
            return actual_value == expected_value
        if operator == self.Operators.NOT_EQUAL_TO:
            return actual_value != expected_value
        if operator in (self.Operators.IN, self.Operators.NOT_IN):
            if not isinstance(expected_value, list):
                raise ValueError(f"Spec mapping error: Invalid expected value type: {type(expected_value)} specified in condition: {condition}. Expected list.")
            is_member = actual_value in expected_value
            return is_member if operator == self.Operators.IN else not is_member

        raise ValueError(f"Spec mapping error: Unknown condition operator: {operator} specified in condition: {condition}.")

    # Key Operations
    @staticmethod
    def _get_nested_value(data: Dict, key_path: str) -> Any:
        """Get a value from a nested dict using dot notation; None if any segment is missing."""
        parts = key_path.split(".")
        current = data
        for part in parts:
            if isinstance(current, dict) and part in current:
                current = current[part]
            else:
                return None
        return current

    @staticmethod
    def _get_parent_and_key(data: Dict, path: list, create_missing: bool = True) -> Tuple[Optional[Dict], str]:
        """
        Return the parent dict and final key name for a path.

        Args:
            data: The dictionary to traverse
            path: List of keys representing the path
            create_missing: If True, create intermediate dicts. If False, return None for parent if path doesn't exist.

        Returns:
            Tuple of (parent_dict, final_key). parent_dict is None if create_missing=False and path doesn't exist.
        """
        current = data
        for k in path[:-1]:
            if create_missing:
                current = current.setdefault(k, {})
            else:
                if not isinstance(current, dict) or k not in current:
                    return None, path[-1]
                current = current[k]
        return current, path[-1]

    def _move_key_recursive(
        self,
        obj: Any,
        src_path: str,
        dest_path: str,
        condition: Optional[Dict],
        context: Optional[Dict] = None
    ) -> Any:
        """
        Recursively move keys throughout the spec structure.

        For a src_path like "sourceDetails.pythonFunctionPath" and dest_path
        "sourceDetails.pythonTransform.functionPath", this will find ALL occurrences
        of sourceDetails containing pythonFunctionPath and move them.

        Only two-segment source paths ("parent.child") are matched. The first
        segment of ``dest_path`` is not used: the destination is built inside
        the matched parent from the remaining segments.

        The condition is evaluated against the dict that contains the matched
        parent key (e.g., the object containing sourceDetails, which also has
        sourceType).

        Returns:
            A rebuilt copy of ``obj`` (dicts and lists are recreated); scalars
            are returned unchanged.
        """
        if isinstance(obj, dict):
            src_parts = src_path.split(".")
            dest_parts = dest_path.split(".")
            result = {}

            for key, value in obj.items():
                # Check if this is a match: key starts path, value is dict with source key, condition met
                if (key == src_parts[0] and
                    isinstance(value, dict) and
                    len(src_parts) == 2 and
                    src_parts[1] in value and
                    self._evaluate_condition(obj, condition)):

                    # Transform: remove source key and create nested destination
                    new_value = {k: v for k, v in value.items() if k != src_parts[1]}
                    current = new_value
                    for dest_key in dest_parts[1:-1]:
                        current = current.setdefault(dest_key, {})
                    current[dest_parts[-1]] = value[src_parts[1]]

                    result[key] = self._move_key_recursive(new_value, src_path, dest_path, condition, obj)
                else:
                    result[key] = self._move_key_recursive(value, src_path, dest_path, condition, obj)

            return result

        if isinstance(obj, list):
            return [self._move_key_recursive(item, src_path, dest_path, condition, context) for item in obj]

        return obj

    def _move_key(self, data: Dict, src_path: str, dest_path: str) -> None:
        """Copy the value at src_path to dest_path in place (non-recursive, for root-level moves).

        The source key itself is left in place by this method. Nothing happens
        when the source path does not exist.
        """
        src_parts = src_path.split(".")
        dest_parts = dest_path.split(".")

        # Check source exists WITHOUT creating intermediate dicts
        src_parent, src_key = self._get_parent_and_key(data, src_parts, create_missing=False)
        if src_parent is None or src_key not in src_parent:
            return

        value = src_parent[src_key]
        # Create destination path (this is intentional)
        dest_parent, dest_key = self._get_parent_and_key(data, dest_parts, create_missing=True)
        dest_parent[dest_key] = value

    def _rename_key_specific(self, data: Dict, current_key: str, new_key: str) -> None:
        """Rename a specific key at an exact path, in place.

        Only the last segment of ``new_key`` is used; the key stays under its
        current parent. Nothing happens when the source path does not exist.
        """
        current_parts = current_key.split(".")
        # Check source exists WITHOUT creating intermediate dicts
        parent, key = self._get_parent_and_key(data, current_parts, create_missing=False)
        if parent is None or key not in parent:
            return

        new_key_name = new_key.split(".")[-1]
        parent[new_key_name] = parent.pop(key)

    def _rename_keys_recursive(self, obj: Any, key_mapping: Dict) -> Any:
        """Recursively rename keys in nested dictionaries, returning a rebuilt structure."""
        if isinstance(obj, dict):
            renamed_obj = {}
            for key, value in obj.items():
                new_key = key_mapping.get(key, key)
                if new_key != key:
                    self.logger.debug(f"Renaming key '{key}' to '{new_key}'")
                renamed_obj[new_key] = self._rename_keys_recursive(value, key_mapping)
            return renamed_obj
        if isinstance(obj, list):
            return [self._rename_keys_recursive(item, key_mapping) for item in obj]
        return obj

    def _delete_key(self, data: Dict, src_path: str) -> None:
        """Delete a key and its value from a nested dict, in place; no-op if the path is missing."""
        src_parts = src_path.split(".")
        # Check source exists WITHOUT creating intermediate dicts
        src_parent, src_key = self._get_parent_and_key(data, src_parts, create_missing=False)
        if src_parent is not None and src_key in src_parent:
            src_parent.pop(src_key)
class TemplateProcessor:
    """
    Handle template file loading and expansion into concrete dataflow specs.

    This processor is responsible for:
    - Loading template definition files from the file system (with caching)
    - Finding and validating parameter placeholders
    - Expanding templates with provided parameter sets
    - Generating multiple concrete specs from a single template definition

    Attributes:
        bundle_path (str): Path to the pipeline bundle containing templates
        framework_path (str): Path to the framework containing schemas
        logger: Logger instance for tracking template processing

    Note:
        Template definitions are cached (by template name) after first load to
        optimize performance when the same template is used multiple times.
    """

    class DefinitionKeys:
        """Constants for dictionary keys of template definition files"""
        TEMPLATE = "template"
        PARAMETER_DEFINITIONS = "parameters"
        DATA_FLOW_ID = "dataFlowId"
        PARAM_TYPE = "type"
        PARAM_REQUIRED = "required"
        PARAM_DEFAULT = "default"

    class SpecKeys:
        """Constants for dictionary keys of template dataflow spec files"""
        TEMPLATE_NAME = "template"
        PARAMETER_SETS = "parameterSets"
        DATA_FLOW_ID = "dataFlowId"
        TAGS = "tags"
        TAG_IS_TEMPLATE_GENERATED = "_isTemplateGenerated"
        TAG_TEMPLATE_NAME = "_templateName"

    def __init__(self, bundle_path: str, framework_path: str):
        """Initialize the template processor."""
        self.bundle_path = bundle_path
        self.framework_path = framework_path
        self.logger = pipeline_config.get_logger()
        # Matches "${param.<name>}" and captures <name>.
        self._pattern = re.compile(r'\$\{param\.([^}]+)\}')

        # Initialize cache for loaded templates
        self._template_cache: Dict[str, Dict] = {}

        # Initialize validators for template definitions and template specs
        self.template_definition_validator = utility.JSONValidator(
            os.path.join(self.framework_path, FrameworkPaths.TEMPLATE_DEFINITION_SPEC_SCHEMA_PATH)
        )
        self.template_spec_validator = utility.JSONValidator(
            os.path.join(self.framework_path, FrameworkPaths.TEMPLATE_SPEC_SCHEMA_PATH)
        )

    def process_template_spec(
        self,
        file_path: str,
        template_spec: Dict,
        spec_file_format: str = SupportedSpecFormat.JSON.value
    ) -> Dict[str, Dict]:
        """
        Process a template spec into multiple concrete specs based on provided parameters.

        Takes a template spec that includes:
        - A template name referencing a template file
        - A list of parameter sets

        Returns a dictionary where each key is a unique identifier and each value
        is a fully expanded concrete spec with all parameters substituted.

        Args:
            file_path: Path to the original template spec file (used for tracking)
            template_spec: Dictionary containing a template dataflow spec
            spec_file_format: File format used to locate and load the template
                definition file.

        Returns:
            Dict[str, Dict]: Dictionary mapping unique identifiers
            (``<file_path>#template_<template>_<dataFlowId>``) to expanded specs

        Raises:
            ValueError: If the spec fails validation, the template name or
                parameter sets are missing, parameters are invalid, or spec
                generation fails (including a placeholder with no value)
            FileNotFoundError: If the referenced template file cannot be found
        """
        # Validate template spec against schema
        validation_errors = self.template_spec_validator.validate(template_spec)
        if validation_errors:
            error_msg = f"Template definition validation failed for {file_path}:\n{validation_errors}"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        template_name = template_spec.get(self.SpecKeys.TEMPLATE_NAME)
        dataflow_spec_params = template_spec.get(self.SpecKeys.PARAMETER_SETS, [])

        # Validate template spec structure
        if not template_name:
            error_msg = "Template name must be provided in template spec"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        if not dataflow_spec_params:
            error_msg = f"Dataflow specs must be provided for template: {template_name}"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        # Load template content and validate against schema
        template_definition = self._get_template_definition(template_name, spec_file_format)

        # Get and validate parameter definitions
        param_definitions = self._get_template_parameters(template_definition, template_name)

        # Generate a spec for each parameter set
        processed_specs = {}
        spec_template = template_definition.get(self.DefinitionKeys.TEMPLATE, {})
        for params in dataflow_spec_params:
            dataflow_id = params.get(self.SpecKeys.DATA_FLOW_ID)
            spec_key = f'{file_path}#template_{template_name}_{dataflow_id}'

            # Validate and apply defaults to parameters
            processed_params = self._validate_and_apply_defaults(
                params, param_definitions, spec_key
            )

            # Generate Spec with this parameter set
            try:
                generated_spec = self._generate_spec(spec_template, processed_params)

                # Explicitly mark this spec as template-generated for downstream processing.
                # Copy the tags first: a whole-value placeholder substitutes the
                # caller's own object, which must not be mutated here.
                tags = dict(generated_spec.get(self.SpecKeys.TAGS) or {})
                tags[self.SpecKeys.TAG_IS_TEMPLATE_GENERATED] = True
                tags[self.SpecKeys.TAG_TEMPLATE_NAME] = template_name
                generated_spec[self.SpecKeys.TAGS] = tags

                # Add spec
                processed_specs[spec_key] = generated_spec
                self.logger.debug(f"Generated spec from template: '{spec_key}'")
            except Exception as e:
                error_msg = f"Failed to generate spec from template '{spec_key}': {str(e)}"
                self.logger.error(error_msg)
                raise ValueError(error_msg) from e

        self.logger.info(
            f"Successfully generated {len(processed_specs)} specs from template '{template_name}'"
        )

        return processed_specs

    def _get_template_definition(self, template_name: str, spec_file_format: str) -> Dict:
        """Load a template definition from the file system, with caching by template name.

        Raises:
            FileNotFoundError: If no template file exists for the given format.
            ValueError: If the template file fails schema validation.
        """
        if template_name in self._template_cache:
            self.logger.debug(f"Using cached template definition: {template_name}")
            return self._template_cache[template_name]

        base_path = os.path.join(
            self.bundle_path,
            PipelineBundlePaths.TEMPLATE_PATH,
            template_name
        )
        template_path = self._resolve_template_path(base_path, spec_file_format)
        if not template_path:
            # The format is a parameter of this call, not an instance attribute.
            error_msg = f"Template not found: {base_path} for format: {spec_file_format.upper()}"
            self.logger.error(error_msg)
            raise FileNotFoundError(error_msg)

        template_definition = utility.load_config_file(template_path, spec_file_format, True)
        template_validation_errors = self.template_definition_validator.validate(template_definition)
        if template_validation_errors:
            error_msg = f"Template file validation failed for template '{template_name}':\n{template_validation_errors}"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        self.logger.info(f"Loaded template definition '{template_name}' from: {template_path}")
        self._template_cache[template_name] = template_definition

        return template_definition

    def _resolve_template_path(self, base_path: str, spec_file_format: str) -> str | None:
        """Resolve the full path to a template file, handling both .yaml and .yml extensions.

        Returns:
            The first existing ``<base_path>.<ext>`` path, or None when no
            candidate exists.
        """
        # For YAML format, check both .yaml and .yml extensions;
        # for other formats (e.g., JSON), use the format directly as extension.
        if spec_file_format in ('yaml', 'yml'):
            extensions = ('yaml', 'yml')
        else:
            extensions = (spec_file_format,)

        for ext in extensions:
            file_path = f"{base_path}.{ext}"
            if os.path.exists(file_path):
                return file_path

        return None

    def clear_cache(self) -> None:
        """Clear the template cache. Used for testing or when templates need to be reloaded from disk after being modified."""
        cache_size = len(self._template_cache)
        self._template_cache.clear()
        self.logger.info(f"Cleared template cache ({cache_size} templates)")

    def get_cache_info(self) -> Dict[str, Any]:
        """Return the number of cached templates and the list of their names."""
        return {
            "cached_templates": len(self._template_cache),
            "template_names": list(self._template_cache.keys())
        }

    def _get_template_parameters(self, template_definition: Dict, template_name: str) -> Dict:
        """Get parameter definitions from template and validate against template content.

        Raises:
            ValueError: If the template declares no parameters, does not declare
                'dataFlowId', or uses a placeholder that has no definition.
        """
        param_definitions = template_definition.get(self.DefinitionKeys.PARAMETER_DEFINITIONS, {})

        if not param_definitions:
            error_msg = f"Template '{template_name}' contains no parameters"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        if self.DefinitionKeys.DATA_FLOW_ID not in param_definitions:
            error_msg = f"Template '{template_name}' must declare 'dataFlowId' parameter"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        # Extract template content and find all parameter placeholders used in it.
        # Serializing to JSON lets one regex scan cover keys and values at any depth.
        template_content = template_definition.get(self.DefinitionKeys.TEMPLATE, {})
        template_string = json.dumps(template_content)
        unique_placeholders = set(self._pattern.findall(template_string))
        defined_params = set(param_definitions.keys())

        # Validate all placeholders have definitions
        undefined_params = unique_placeholders - defined_params
        if undefined_params:
            error_msg = f"Template '{template_name}' uses undefined parameters: {sorted(undefined_params)}"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        # Warn about unused parameter definitions
        unused_params = defined_params - unique_placeholders
        if unused_params:
            self.logger.warning(
                f"Template '{template_name}' has unused parameter definitions: {sorted(unused_params)}"
            )

        self.logger.debug(
            f"Template '{template_name}' parameter definitions: {json.dumps(param_definitions, indent=4)}"
        )

        return param_definitions

    def _validate_and_apply_defaults(
        self,
        params: Dict,
        param_definitions: Dict,
        spec_key: str
    ) -> Dict:
        """Validate provided parameters against definitions and apply defaults.

        Parameters are required unless their definition sets ``required`` to a
        falsy value. An optional parameter that is not provided gets its
        ``default`` when one is defined (and not None); otherwise it is omitted
        from the result. Provided keys without a definition are ignored.

        Raises:
            ValueError: If a provided value has the wrong type or a required
                parameter is missing.
        """
        processed_params = {}
        missing_required = []

        for param_name, param_def in param_definitions.items():
            param_type = param_def.get(self.DefinitionKeys.PARAM_TYPE)
            is_required = param_def.get(self.DefinitionKeys.PARAM_REQUIRED, True)
            default_value = param_def.get(self.DefinitionKeys.PARAM_DEFAULT, None)

            if param_name in params:
                param_value = params[param_name]
                if not self._validate_parameter_type(param_value, param_type):
                    error_msg = (
                        f"Error validating parameter '{param_name}' in spec '{spec_key}': "
                        f"Expected type '{param_type}', got '{type(param_value).__name__}'"
                    )
                    self.logger.error(error_msg)
                    raise ValueError(error_msg)
                processed_params[param_name] = param_value
            elif is_required:
                missing_required.append(param_name)
            elif default_value is not None:
                processed_params[param_name] = default_value
                self.logger.debug(
                    f"Applied default value for parameter '{param_name}': {default_value}"
                )
            # If not required and no default, skip it

        if missing_required:
            error_msg = f"Generated spec '{spec_key}' is missing required parameters: {missing_required}"
            self.logger.error(error_msg)
            raise ValueError(error_msg)

        return processed_params

    def _validate_parameter_type(
        self,
        param_value: Any,
        expected_type: str
    ) -> bool:
        """Validate a parameter value matches its declared type.

        Supported type names: string, integer, boolean, list, object. An
        unknown type name never validates.
        """
        type_map = {
            "string": str,
            "integer": int,
            "boolean": bool,
            "list": list,
            "object": dict
        }

        expected_python_type = type_map.get(expected_type)
        if expected_python_type is None:
            return False

        # bool is a subclass of int, so True/False would otherwise be accepted
        # as "integer".
        if expected_python_type is int and isinstance(param_value, bool):
            return False

        return isinstance(param_value, expected_python_type)

    def _generate_spec(self, obj, params: Dict):
        """Recursively process a value, replacing parameter placeholders.

        Dict keys and values, list items and strings are processed; any other
        value is returned unchanged.

        Raises:
            ValueError: If substitution turns a dictionary key into a non-string.
            KeyError: If a placeholder has no value in ``params``.
        """
        if isinstance(obj, dict):
            result = {
                self._generate_spec(k, params): self._generate_spec(v, params)
                for k, v in obj.items()
            }
            # Validate all keys are strings
            if non_string_keys := [k for k in result.keys() if not isinstance(k, str)]:
                error_msg = f"Dictionary keys must be strings, found: {non_string_keys}"
                self.logger.error(error_msg)
                raise ValueError(error_msg)
            return result

        if isinstance(obj, list):
            return [self._generate_spec(item, params) for item in obj]

        if isinstance(obj, str):
            return self._replace_string_placeholders(obj, params)

        return obj

    def _replace_string_placeholders(self, text: str, params: Dict):
        """Replace parameter placeholders in a string value.

        A string that is exactly one placeholder is replaced by the parameter
        value itself (keeping its type); placeholders embedded in a longer
        string are replaced by ``str(value)``.
        """
        # Full replacement: entire string is a single placeholder
        if (m := self._pattern.fullmatch(text)):
            return self._get_param_value(m.group(1), params)

        # Partial replacement: string contains placeholders
        if self._pattern.search(text):
            return self._pattern.sub(
                lambda m: str(self._get_param_value(m.group(1), params)),
                text
            )

        return text

    def _get_param_value(self, param_key: str, params: Dict):
        """Get a parameter value from the params dictionary.

        Raises:
            KeyError: If ``param_key`` is not present in ``params``.
        """
        if param_key not in params:
            error_msg = f"Parameter '{param_key}' not found in params"
            self.logger.error(error_msg)
            raise KeyError(error_msg)
        return params[param_key]
class BaseSpecTransformer(ABC):
    """Base class for dataflow spec transformers.

    Subclasses implement ``_process_spec``; callers use ``transform``.
    """

    def __init__(self):
        """Bind the shared framework logger obtained from ``pipeline_config``."""
        self.logger = pipeline_config.get_logger()

    @abstractmethod
    def _process_spec(self, spec_data: Dict) -> Union[Dict, List[Dict]]:
        """Transform the spec data. Returns either a single Dict or List[Dict]."""
        pass

    def transform(self, spec_data: Dict) -> Union[Dict, List[Dict]]:
        """Transform the spec data by delegating to ``_process_spec``.

        Args:
            spec_data: The raw dataflow specification.

        Returns:
            Whatever the subclass's ``_process_spec`` returns: a single spec
            dict or a list of spec dicts.
        """
        return self._process_spec(spec_data)

    def _apply_features_and_limitations(self, dataflow_spec: Dict) -> Dict:
        """Apply common feature defaults and limitations to a spec, in place.

        - ``features`` is guaranteed to be a dict afterwards (a missing, empty
          or ``None`` value is replaced by a new dict).
        - ``features.operationalMetadataEnabled`` defaults to ``True`` only
          when it is not set; an explicit ``False`` is preserved.
        - When ``cdcSnapshotSettings`` is present and truthy, operational
          metadata is forced off.

        Args:
            dataflow_spec: The spec to update; it is mutated.

        Returns:
            The same ``dataflow_spec`` object.
        """
        # ``or {}`` also covers an explicit ``"features": None``, which would
        # otherwise fail on ``.get`` below.
        features = dataflow_spec.get("features") or {}
        dataflow_spec["features"] = features

        # FEATURE: Operational Metadata
        # Default only when unset. A truthiness test here would overwrite an
        # explicit False and make the flag impossible to turn off.
        if features.get("operationalMetadataEnabled") is None:
            features["operationalMetadataEnabled"] = True

        # LIMITATIONS: CDC SNAPSHOT
        if dataflow_spec.get("cdcSnapshotSettings"):
            features["operationalMetadataEnabled"] = False

        return dataflow_spec
class SpecTransformerFactory:
    """Factory for creating dataflow spec transformers."""

    # Registry: lower-case dataflow type name -> transformer class.
    _transformers = {
        "standard": StandardSpecTransformer,
        "flow": FlowSpecTransformer,
        "materialized_view": MaterializedViewSpecTransformer
    }

    @classmethod
    def create_transformer(cls, dataflow_type: str, dataflow_spec_mapping_path: str = None) -> BaseSpecTransformer:
        """Create the transformer registered for a dataflow type.

        Args:
            dataflow_type: Dataflow type name; matched case-insensitively
                against the registry.
            dataflow_spec_mapping_path: Accepted for interface compatibility;
                not used by this method.

        Returns:
            A new instance of the matching transformer class.

        Raises:
            ValueError: If ``dataflow_type`` is not a string or does not match
                a registered type.
        """
        # A missing (None) or non-string type used to fail inside ``.lower()``
        # with an AttributeError; route it to the same ValueError as any other
        # unknown type so callers get one consistent failure mode.
        lookup_key = dataflow_type.lower() if isinstance(dataflow_type, str) else None
        transformer_class = cls._transformers.get(lookup_key)
        if not transformer_class:
            raise ValueError(f"Unknown dataflow type: {dataflow_type}")

        return transformer_class()

    @classmethod
    def get_supported_types(cls) -> List[str]:
        """Return list of supported dataflow types."""
        return list(cls._transformers.keys())
class MaterializedViewSpecTransformer(BaseSpecTransformer):
    """Expand a materialized-view dataflow spec into one flow spec per materialized view."""

    # Constants for values not available in enums
    MAIN_FLOW_GROUP_ID = "main"
    FLOW_NAME_PREFIX = "f_"

    def _process_spec(self, spec_data: Dict) -> List[Dict]:
        """Produce a flow spec for every entry under ``materializedViews``.

        Returns an empty list (after logging a warning) when the spec declares
        no materialized views. Any error raised while building one view's spec
        is logged with the view name and re-raised unchanged.
        """
        views_by_name = spec_data.get("materializedViews", {})
        if not views_by_name:
            self.logger.warning("No materialized views found in dataflow spec")
            return []

        generated = []
        for view_name, view_config in views_by_name.items():
            try:
                generated.append(self._create_flow_spec(view_name, view_config, spec_data))
            except Exception as exc:
                self.logger.error(f"Error transforming materialized view '{view_name}': {str(exc)}")
                raise

        return generated

    def _create_flow_spec(self, mv_name: str, mv_config: Dict, spec_data: Dict) -> Dict:
        """Assemble the full flow spec for one materialized view.

        Raises:
            ValueError: If ``mv_name`` is empty or not a string.
        """
        if not (mv_name and isinstance(mv_name, str)):
            raise ValueError(f"Invalid materialized view name: {mv_name}")

        target = self._build_target_details(mv_name, mv_config)
        spec = self._build_base_flow_spec(spec_data, mv_config, target)
        # The flow-group builder may add "sourceView" to ``target``; ``spec``
        # holds the same dict object, so the addition is visible there too.
        spec["flowGroups"] = [self._create_mv_flow_group(mv_name, mv_config, target)]
        return spec

    def _build_target_details(self, mv_name: str, mv_config: Dict) -> Dict:
        """Copy ``tableDetails`` and overlay the materialized-view target fields."""
        details = mv_config.get("tableDetails", {}).copy()
        details["table"] = mv_name
        details["type"] = TableType.MATERIALIZED_VIEW
        details["sqlPath"] = mv_config.get("sqlPath")
        details["sqlStatement"] = mv_config.get("sqlStatement")
        return details

    def _build_base_flow_spec(self, spec_data: Dict, mv_config: Dict, target_details: Dict) -> Dict:
        """Build the top-level flow spec (everything except ``flowGroups``)."""
        # Identity fields come from the parent spec.
        base = {
            key: spec_data.get(key)
            for key in ("dataFlowId", "dataFlowGroup", "dataFlowType")
        }
        base["targetFormat"] = TargetType.DELTA
        base["targetDetails"] = target_details
        # Quarantine / data-quality settings are per materialized view.
        base["quarantineMode"] = mv_config.get("quarantineMode")
        base["quarantineTargetDetails"] = mv_config.get("quarantineTargetDetails")
        base["dataQualityExpectationsEnabled"] = mv_config.get("dataQualityExpectationsEnabled", False)
        base["dataQualityExpectationsPath"] = mv_config.get("dataQualityExpectationsPath")
        base["localPath"] = spec_data.get("localPath")
        return base

    def _create_mv_flow_group(self, mv_name: str, mv_config: Dict, target_details: Dict) -> Dict:
        """Build the single "main" flow group for a materialized view.

        When the view declares a named source view, ``target_details`` is
        updated in place with ``sourceView`` and one flow is added; otherwise
        the group is returned with no flows.
        """
        flows = {}
        group = {
            "flowGroupId": self.MAIN_FLOW_GROUP_ID,
            "flows": flows
        }

        source_view = mv_config.get("sourceView", {})
        source_view_name = source_view.get("sourceViewName")

        if source_view and source_view_name:
            # Record the source view on the target (mutates the caller's dict).
            target_details["sourceView"] = source_view_name
            flow_key = f"{self.FLOW_NAME_PREFIX}{source_view_name}"
            flows[flow_key] = self._build_materialized_view_flow(mv_name, source_view_name, source_view)

        return group

    def _build_materialized_view_flow(self, mv_name: str, source_view_name: str, source_view: Dict) -> Dict:
        """Build the flow entry, including its single batch-mode source view."""
        source_type = source_view.get("sourceType")
        # Work on a shallow copy so the incoming spec is not modified.
        source_details = source_view.get("sourceDetails", {}).copy()

        # Ensure CDF is disabled for delta sources
        if source_type == SourceType.DELTA:
            source_details["cdfEnabled"] = False

        return {
            "flowType": FlowType.MATERIALIZED_VIEW,
            "flowDetails": {
                "sourceView": source_view_name,
                "targetTable": mv_name
            },
            "enabled": True,
            "views": {
                source_view_name: {
                    "mode": Mode.BATCH,
                    "sourceType": source_type,
                    "sourceDetails": source_details
                }
            }
        }
spec_data.get("dataFlowGroup"), + "dataFlowType": spec_data.get("dataFlowType"), + "targetFormat": spec_data.get("targetFormat"), + "targetDetails": spec_data.get("targetDetails"), + "quarantineMode": spec_data.get("quarantineMode"), + "quarantineTargetDetails": spec_data.get("quarantineTargetDetails"), + "dataQualityExpectationsEnabled": spec_data.get("dataQualityExpectationsEnabled", False), + "dataQualityExpectationsPath": spec_data.get("dataQualityExpectationsPath"), + "dataQualityExpectations": spec_data.get("dataQualityExpectations"), + "tableMigrationDetails": spec_data.get("tableMigrationDetails"), + "localPath": spec_data.get("localPath") + } + + def _create_flow_group(self, spec_data: Dict, mode: str, has_cdc_settings: bool, has_cdc_snapshot_settings: bool) -> Dict: + """Create the main flow group with flows.""" + flow_group = { + "flowGroupId": self.MAIN_FLOW_GROUP_ID, + "flows": {} + } + + # Get target and source details + source_view_name = spec_data.get("sourceViewName") + target_details = spec_data.get("targetDetails", {}) + target_format = spec_data.get("targetFormat") + if target_format == "delta": + target_database = target_details.get("database", None) + target_table = target_details.get("table", None) + target_table = target_table if target_database is None else f"{target_database}.{target_table}" + elif target_format.lower().endswith("sink"): + target_table = target_details.get("name") + else: + raise ValueError(f"Standard Spec Transformation: Unknown target format: {target_format}") + + # Create flow name and type + flow_name = self._create_flow_name(source_view_name, target_table, has_cdc_snapshot_settings, spec_data) + flow_type = FlowType.MERGE if has_cdc_settings or has_cdc_snapshot_settings else FlowType.APPEND_VIEW + + # Build flow + flow = { + "flowType": flow_type, + "flowDetails": { + "sourceView": source_view_name, + "targetTable": target_table + }, + "enabled": True + } + flow_group["flows"][flow_name] = flow + + # Add views if 
source view exists + if source_view_name: + flow["views"] = self._build_views(spec_data, source_view_name, mode) + + return flow_group + + def _create_flow_name(self, source_view_name: str, target_table: str, has_cdc_snapshot_settings: bool, spec_data: Dict) -> str: + """Create appropriate flow name based on configuration.""" + if has_cdc_snapshot_settings: + snapshot_type = spec_data.get("cdcSnapshotSettings", {}).get("snapshotType") + if snapshot_type == "historical": + return f"f_historical_snapshot_for_{target_table}" + + return f"{self.FLOW_NAME_PREFIX}{source_view_name}" + + def _build_views(self, spec_data: Dict, source_view_name: str, mode: str) -> Dict: + """Build views configuration for the flow.""" + return { + source_view_name: { + "mode": mode, + "sourceType": spec_data.get("sourceType"), + "sourceDetails": spec_data.get("sourceDetails", {}), + } + } + + def _add_cdc_configuration(self, flow_spec: Dict, spec_data: Dict) -> None: + """Add CDC configuration to flow spec if present.""" + if spec_data.get("cdcSettings"): + flow_spec["cdcSettings"] = spec_data["cdcSettings"] + if spec_data.get("cdcSnapshotSettings"): + flow_spec["cdcSnapshotSettings"] = spec_data["cdcSnapshotSettings"] diff --git a/src/dlt_pipeline.ipynb b/src/dlt_pipeline.ipynb new file mode 100644 index 0000000..60c16eb --- /dev/null +++ b/src/dlt_pipeline.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Pipeline Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from constants import DLTPipelineSettingKeys\n", + "\n", + "def get_required_config(key: str, description: str) -> str:\n", + " \"\"\"Get a required config value from spark.conf or raise an error.\"\"\"\n", + " value = spark.conf.get(key, 
None)\n", + " if value is None:\n", + " raise ValueError(f\"Pipeline settings error: {description} is not set.\")\n", + " return value\n", + "\n", + "framework_source_path = get_required_config(\n", + " DLTPipelineSettingKeys.FRAMEWORK_SOURCE_PATH, \"Framework source path\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lakeflow Framework Entry Point" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Append the framework path to the system path\n", + "if framework_source_path not in sys.path:\n", + " sys.path.append(framework_source_path)\n", + "\n", + "# Import and initialize the Spark Declarative Pipeline builder\n", + "from dlt_pipeline_builder import DLTPipelineBuilder\n", + "DLTPipelineBuilder(spark, dbutils).initialize_pipeline()" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "dlt_pipeline", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/src/dlt_pipeline_builder.py b/src/dlt_pipeline_builder.py new file mode 100644 index 0000000..0a9f281 --- /dev/null +++ b/src/dlt_pipeline_builder.py @@ -0,0 +1,435 @@ +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timezone +import json +import os +import sys + +from pyspark import pipelines as dp +from pyspark.dbutils import DBUtils +import pyspark.sql.types as T +from pyspark.sql import SparkSession +from typing import Dict, Any + +from constants import( + FrameworkPaths, FrameworkSettings, PipelineBundlePaths, DLTPipelineSettingKeys, SupportedSpecFormat +) +from dataflow import DataFlow +from dataflow_spec_builder 
import DataflowSpecBuilder +from pipeline_details import PipelineDetails +from secrets_manager import SecretsManager +from substitution_manager import SubstitutionManager + +import pipeline_config +import utility + + +class DLTPipelineBuilder: + """ + Initializes a dataflow in a Spark Declarative Pipeline based on the pipeline configuration and dataflow specifications. + + Args: + spark (SparkSession): The Spark session to use for the pipeline. + dbutils (DBUtils): The DBUtils to use for the pipeline. + + Attributes: + spark (SparkSession): The Spark session to use for the pipeline. + dbutils (DBUtils): The DBUtils to use for the pipeline. + + logger (Logger): The logger to use for the pipeline. + context (NotebookContext): The notebook context to use for the pipeline. + token (str): The token to use for the pipeline. + + pipeline_config (Dict[str, Any]): The pipeline configuration to use for the pipeline. + framework_path (str): The path to the framework to use for the pipeline. + bundle_path (str): The path to the bundle to use for the pipeline. + dataflow_path (str): The path to the dataflow to use for the pipeline. + workspace_host (str): The host to use for the pipeline. + + dataflow_specs (List[DataflowSpec]): The dataflow specifications to use for the pipeline. + dataflow_spec_filters (Dict[str, Any]): The filters to use for the dataflow specifications. + + pipeline_details (PipelineDetails): The pipeline details to use for the pipeline. + mandatory_table_properties (Dict[str, Any]): The mandatory table properties to use for the pipeline. + operational_metadata_schema (StructType): The operational metadata schema to use for the pipeline. + substitution_manager (SubstitutionManager): The substitution manager to use for the pipeline. + + Methods: + initialize_pipeline(): Initializes a dataflow in a Spark Declarative Pipeline. 
def __init__(self, spark: SparkSession, dbutils: DBUtils):
    """Initialize the pipeline builder with Spark session and utilities.

    Stores the Spark session and DBUtils, reads the notebook context and API
    token from ``dbutils``, seeds default attribute values, configures the
    logger, registers the core singletons in ``pipeline_config``, and then
    loads configuration and initializes the pipeline components.

    Args:
        spark: The Spark session to use for the pipeline.
        dbutils: The DBUtils to use for the pipeline.
    """
    # Initialize Spark context
    self.spark = spark
    self.dbutils = dbutils
    self.context = self.dbutils.entry_point.getDbutils().notebook().getContext()
    self.token = self.context.apiToken().get()

    self.pipeline_bundle_spec_format = "json"  # Default to JSON format
    self.dataflow_specs = []
    self.dataflow_spec_filters = {}
    self.mandatory_table_properties = {}
    self.operational_metadata_schema = None
    self.pipeline_config = {}
    self.secrets_manager = None
    self.substitution_manager = None
    self.driver_cores = os.cpu_count()
    # Leave one core free, but never drop below one worker: on a single-core
    # driver ``cores - 1`` is 0, which ThreadPoolExecutor rejects.
    # ``os.cpu_count()`` may also return None, in which case fall back to 1.
    self.default_max_workers = max(1, self.driver_cores - 1) if self.driver_cores else 1

    # Initialize logger
    log_level = self.spark.conf.get(DLTPipelineSettingKeys.LOG_LEVEL, "INFO").upper()
    self.logger = utility.set_logger("DltFramework", log_level)
    self.logger.info("Initializing Pipeline...")
    self.logger.info("Logical cores (threads): %s", self.driver_cores)
    self.logger.info("Max workers: %s", self.default_max_workers)

    # Initialize core singletons
    pipeline_config.initialize_core(
        spark=self.spark,
        dbutils=self.dbutils,
        logger=self.logger,
    )

    # Load configurations and initialize components
    self._init_configurations()
    self._init_pipeline_components()
config_values[DLTPipelineSettingKeys.BUNDLE_SOURCE_PATH] + self.framework_path = config_values[DLTPipelineSettingKeys.FRAMEWORK_SOURCE_PATH] + self.workspace_host = config_values[DLTPipelineSettingKeys.WORKSPACE_HOST] + + # Load optional parameters + ignore_validation_errors = self.spark.conf.get( + DLTPipelineSettingKeys.PIPELINE_IGNORE_VALIDATION_ERRORS, "false" + ) + self.ignore_validation_errors = (ignore_validation_errors.lower() == "true") + + # Load pipeline details + self.logger.info("Loading Pipeline Details...") + self.pipeline_details = PipelineDetails( + pipeline_id=self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_ID, None), + pipeline_catalog=self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_CATALOG, None), + pipeline_schema=( + self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_SCHEMA, None) + or self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_TARGET, None) + ), + pipeline_layer=self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_LAYER, None), + start_utc_timestamp=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f'), + workspace_env=self.spark.conf.get(DLTPipelineSettingKeys.BUNDLE_TARGET, None), + logical_env=self.spark.conf.get(DLTPipelineSettingKeys.LOGICAL_ENV, "") + ) + self.logger.info("Pipeline Details: %s", json.dumps(self.pipeline_details.__dict__, indent=4)) + + # Initialize pipeline details singleton + pipeline_config.initialize_pipeline_details(self.pipeline_details) + + # Get the pipeline update id via event hook (only method at the moment) + # This only gets populated post initiliazation. 
Currently retrieved in operational_metadata.py + @dp.on_event_hook + def update_id_hook(event): + event_type = event.get("event_type") + if event_type == "create_update": + self.spark.conf.set("pipeline.pipeline_update_id", f'{event.get("origin", {}).get("update_id", "")}') + + # Load dataflow filters + self.logger.info("Loading Dataflow Filters...") + self.dataflow_spec_filters = { + "data_flow_ids": self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_FILTER_DATA_FLOW_ID, None), + "data_flow_groups": self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_FILTER_DATA_FLOW_GROUP, None), + "flow_group_ids": self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_FILTER_FLOW_GROUP_ID, None), + "target_tables": self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_FILTER_TARGET_TABLE, None), + "files": self.spark.conf.get(DLTPipelineSettingKeys.PIPELINE_FILE_FILTER, None), + } + + def _init_pipeline_components(self) -> None: + """Initialize all pipeline components.""" + + # Load and merge configurations + self._load_merged_config() + + # Initialize substitution manager + self._init_substitution_manager() + + # Initialize secrets manager + self._init_secrets_manager() + + # Preload shared Python modules + self._preload_extensions() + + # Initialize dataflow specifications + self._init_dataflow_specs() + + # Setup operational metadata + self._setup_operational_metadata() + + # Apply Spark configurations + self._apply_spark_config() + + def _load_framework_global_config_file(self) -> Dict[str, Any]: + """Load a global config file""" + global_config_paths = [os.path.join(self.framework_path, path) for path in FrameworkPaths.GLOBAL_CONFIG] + + # Check if more than one global config exists + existing_configs = [path for path in global_config_paths if os.path.exists(path)] + if len(existing_configs) > 1: + raise ValueError(f"Multiple framework global config files found. 
def _load_pipeline_bundle_global_config_file(self) -> Dict[str, Any]:
    """Load the pipeline bundle's optional global config file.

    Candidate paths are built from ``self.bundle_path``,
    ``PipelineBundlePaths.PIPELINE_CONFIGS_PATH`` and each entry of
    ``PipelineBundlePaths.GLOBAL_CONFIG_FILE``.

    Returns:
        The loaded config, or an empty dict when no candidate file exists or
        the loader returns a falsy value.

    Raises:
        ValueError: If more than one candidate file exists.
    """
    candidates = (
        os.path.join(self.bundle_path, PipelineBundlePaths.PIPELINE_CONFIGS_PATH, file_name)
        for file_name in PipelineBundlePaths.GLOBAL_CONFIG_FILE
    )
    found = list(filter(os.path.exists, candidates))

    # Unlike the framework-level config, the bundle-level one is optional.
    if not found:
        return {}

    if len(found) > 1:
        raise ValueError(f"Multiple pipeline global config files found. Only one is allowed: {found}")

    (config_path,) = found
    self.logger.info("Retrieving Pipeline Global Config From: %s", config_path)
    # NOTE(review): the framework-level loader in this class goes through
    # utility.load_config_file_auto, while this one uses
    # utility.get_json_from_file; confirm that any non-JSON entries in
    # GLOBAL_CONFIG_FILE are handled as intended.
    return utility.get_json_from_file(config_path, False) or {}
def _init_pipeline_bundle_spec_format(self, pipeline_bundle_config: Dict[str, Any]) -> None:
    """Resolve the spec file format from framework-level and bundle-level settings.

    The ``pipeline_bundle_spec_format`` entry is removed (popped) from both
    ``self.pipeline_config`` and ``pipeline_bundle_config``. The framework
    entry sets the format (default ``"json"``) and whether bundles may
    override it; the bundle entry may then replace the format only when
    overriding is allowed.

    Args:
        pipeline_bundle_config: The bundle's global config; mutated by the pop.

    Raises:
        ValueError: If either level names an unsupported format, or the
            bundle requests a different format while override is disabled.
    """
    setting_key = "pipeline_bundle_spec_format"
    valid_formats = [member.value for member in SupportedSpecFormat]

    # Framework (global) level
    allow_override = False
    framework_setting = self.pipeline_config.pop(setting_key, None)
    if framework_setting:
        self.logger.info("Global pipeline bundle spec format: %s", framework_setting)
        framework_format = framework_setting.get("format", "json")

        if framework_format not in valid_formats:
            raise ValueError(f"Invalid pipeline bundle spec format: {framework_format}. Valid formats are: {valid_formats}")

        self.pipeline_bundle_spec_format = framework_format
        allow_override = framework_setting.get("allow_override", False)

    # Pipeline bundle level
    requested_format = None
    bundle_setting = pipeline_bundle_config.pop(setting_key, None)
    if bundle_setting:
        self.logger.info("Pipeline bundle spec format: %s", bundle_setting)
        requested_format = bundle_setting.get("format", None)

    # A bundle entry without a (truthy) format leaves the current format untouched.
    if requested_format:
        if requested_format not in valid_formats:
            raise ValueError(f"Invalid pipeline bundle spec format: {requested_format}. Valid formats are: {valid_formats}")

        if requested_format != self.pipeline_bundle_spec_format and not allow_override:
            raise ValueError(f"Pipeline bundle spec format has been set at global framework level as {self.pipeline_bundle_spec_format}. Override has been disabled.")

        if allow_override:
            self.pipeline_bundle_spec_format = requested_format

    self.logger.info("Pipeline bundle spec format: %s", self.pipeline_bundle_spec_format)
def _init_dataflow_specs(self) -> None:
    """Build the dataflow specifications and store them on the instance.

    The spec version and the worker count are read from
    ``self.pipeline_config``; the worker count falls back to
    ``self.default_max_workers`` when no override is configured.

    Raises:
        ValueError: If the builder returns no specifications.
    """
    self.logger.info("Initializing Dataflow Spec Builder...")

    spec_version = self.pipeline_config.get("dataflow_spec_version", None)
    worker_count = self.pipeline_config.get(
        FrameworkSettings.OVERRIDE_MAX_WORKERS_KEY,
        self.default_max_workers
    )

    builder_kwargs = {
        "bundle_path": self.bundle_path,
        "framework_path": self.framework_path,
        "filters": self.dataflow_spec_filters,
        "secrets_manager": self.secrets_manager,
        "ignore_validation_errors": self.ignore_validation_errors,
        "dataflow_spec_version": spec_version,
        "max_workers": worker_count,
        "spec_file_format": self.pipeline_bundle_spec_format,
    }
    builder = DataflowSpecBuilder(**builder_kwargs)

    # Store the result before validating it, so the attribute reflects the
    # builder output even when the check below raises.
    self.dataflow_specs = builder.build()

    if not self.dataflow_specs:
        raise ValueError(f"No dataflow specifications found in: {self.bundle_path}")
self.logger.info("Operational Metadata Path: %s", metadata_path) + metadata_json = utility.get_json_from_file(metadata_path, False) + self.operational_metadata_schema = ( + T.StructType.fromJson(metadata_json) if metadata_json else None + ) + + # Initialize operational metadata schema singleton + pipeline_config.initialize_operational_metadata_schema(self.operational_metadata_schema) + + def _apply_spark_config(self) -> None: + """Apply Spark configuration settings.""" + spark_config = self.pipeline_config.get("spark_config", {}) + if spark_config: + self.logger.info("Initializing Spark Configs...") + for prop, value in spark_config.items(): + self.logger.info("Set Spark Config: %s = %s", prop, value) + self.spark.conf.set(prop, value) + + def _preload_extensions(self) -> None: + """Add shared extension directories to sys.path.""" + # Framework extensions + framework_extensions = os.path.join(self.framework_path, FrameworkPaths.EXTENSIONS_PATH) + if os.path.exists(framework_extensions): + sys.path.insert(0, framework_extensions) + self.logger.info("Added framework extensions to sys.path: %s", framework_extensions) + + # Bundle extensions + bundle_extensions = os.path.join(self.bundle_path, PipelineBundlePaths.EXTENSIONS_PATH) + if os.path.exists(bundle_extensions): + sys.path.insert(0, bundle_extensions) + self.logger.info("Added bundle extensions to sys.path: %s", bundle_extensions) + + def initialize_pipeline(self) -> None: + """Initialize the Spark Declarative Pipeline.""" + def create_dataflow(spec): + """Create a dataflow from a specification.""" + return DataFlow(dataflow_spec=spec).create_dataflow() + + self.logger.info("Initializing Pipeline...") + pipeline_builder_threading_disabled = self.pipeline_config.get( + FrameworkSettings.PIPELINE_BUILDER_DISABLE_THREADING_KEY, + True + ) + + self.logger.info("Processing Dataflow Specs...") + if pipeline_builder_threading_disabled: + self.logger.info("Pipeline Builder Threading Disabled, creating dataflows 
sequentially...") + for spec in self.dataflow_specs: + create_dataflow(spec) + else: + pipeline_builder_max_workers = self.pipeline_config.get( + FrameworkSettings.OVERRIDE_MAX_WORKERS_KEY, + self.default_max_workers + ) + + with ThreadPoolExecutor(max_workers=pipeline_builder_max_workers) as executor: + self.logger.info("Pipeline Builder Threading Enabled. Max Workers: %s", pipeline_builder_max_workers) + futures = [ + executor.submit(create_dataflow, spec) + for spec in self.dataflow_specs + ] + for future in futures: + future.result() diff --git a/src/pipeline_config.py b/src/pipeline_config.py new file mode 100644 index 0000000..ac7a52a --- /dev/null +++ b/src/pipeline_config.py @@ -0,0 +1,139 @@ +import os +import json +import logging +from typing import Dict, Any, Optional + +from pyspark.dbutils import DBUtils +import pyspark.sql.types as T +from pyspark.sql import SparkSession + +from constants import DLTPipelineSettingKeys +from pipeline_details import PipelineDetails +from substitution_manager import SubstitutionManager + +# Module-level singletons +_spark = None +_dbutils = None +_logger = None +_substitution_manager = None +_pipeline_details = None +_mandatory_table_properties = None +_operational_metadata_schema = None + + +def initialize_core( + spark: SparkSession, + dbutils: DBUtils, + logger: logging.Logger +) -> None: + """Initialize the pipeline configuration.""" + global _spark, _dbutils, _logger + _spark = spark + _dbutils = dbutils + _logger = logger + + +def initialize_substitution_manager( + substitution_manager: SubstitutionManager +) -> None: + """Initialize the substitution manager.""" + global _substitution_manager + _substitution_manager = substitution_manager + + +def initialize_pipeline_details( + pipeline_details: PipelineDetails +) -> None: + """Initialize the pipeline details.""" + global _pipeline_details + _pipeline_details = pipeline_details + + +def initialize_mandatory_table_properties( + mandatory_table_properties: Dict[str, 
Any] +) -> None: + """Initialize the mandatory table properties.""" + global _mandatory_table_properties + _mandatory_table_properties = mandatory_table_properties + +def initialize_mandatory_configuration() -> None: + """Initialize the mandatory configuration.""" + verion_file = os.path.join( + os.path.dirname(_spark.conf.get(DLTPipelineSettingKeys.FRAMEWORK_SOURCE_PATH, ".")), + 'VERSION' + ) + with open(verion_file, mode='r', encoding='utf-8') as f: + version = f.read().strip() + + mandatory_configuration = { + 'version': version + } + _spark.conf.set('tag.lakeflow_framework', json.dumps(mandatory_configuration)) + +def initialize_operational_metadata_schema( + operational_metadata_schema: Optional[T.StructType] = None +) -> None: + """Initialize the operational metadata schema.""" + global _operational_metadata_schema + _operational_metadata_schema = operational_metadata_schema + +def initialize_table_migration( + table_migration_state_volume_path: str +) -> None: + """Initialize the table migration state volume path.""" + global _table_migration_state_volume_path + _table_migration_state_volume_path = table_migration_state_volume_path + + +def get_spark() -> SparkSession: + """Get the Spark instance.""" + if _spark is None: + raise RuntimeError("Spark has not been initialized. Call initialize_pipeline_config() first.") + return _spark + + +def get_dbutils() -> DBUtils: + """Get the DBUtils instance.""" + if _dbutils is None: + raise RuntimeError("DBUtils has not been initialized. Call initialize_pipeline_config() first.") + return _dbutils + + +def get_logger() -> logging.Logger: + """Get the logger instance.""" + if _logger is None: + raise RuntimeError("Logger has not been initialized. Call initialize_pipeline_config() first.") + return _logger + + +def get_substitution_manager() -> SubstitutionManager: + """Get the substitution manager instance.""" + if _substitution_manager is None: + raise RuntimeError("Substitution manager has not been initialized. 
Call initialize_pipeline_config() first.") + return _substitution_manager + + +def get_pipeline_details() -> PipelineDetails: + """Get the pipeline details instance.""" + if _pipeline_details is None: + raise RuntimeError("Pipeline details has not been initialized. Call initialize_pipeline_config() first.") + return _pipeline_details + + +def get_mandatory_table_properties() -> Dict[str, Any]: + """Get the mandatory table properties.""" + if _mandatory_table_properties is None: + raise RuntimeError("Mandatory table properties have not been initialized. Call initialize_pipeline_config() first.") + return _mandatory_table_properties + + +def get_operational_metadata_schema() -> Optional[T.StructType]: + """Get the operational metadata schema.""" + return _operational_metadata_schema + + +def get_table_migration_state_volume_path() -> str: + """Get the table migration state volume path.""" + if _table_migration_state_volume_path is None: + raise RuntimeError("Table migration state volume path has not been initialized. 
Call initialize_pipeline_config() first.") + return _table_migration_state_volume_path \ No newline at end of file diff --git a/src/pipeline_details.py b/src/pipeline_details.py new file mode 100644 index 0000000..a517a58 --- /dev/null +++ b/src/pipeline_details.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class PipelineDetails: + """Container for pipeline configuration details.""" + pipeline_id: Optional[str] + pipeline_catalog: Optional[str] + pipeline_schema: Optional[str] + pipeline_layer: Optional[str] + start_utc_timestamp: str + workspace_env: Optional[str] + logical_env: str \ No newline at end of file diff --git a/src/schemas/definitions_flows.json b/src/schemas/definitions_flows.json new file mode 100644 index 0000000..610d277 --- /dev/null +++ b/src/schemas/definitions_flows.json @@ -0,0 +1,68 @@ +{ + "title": "Flow DataflowSpec Definitions", + "flowType": { + "oneOf": [ + { + "properties": { + "flowType": { "const": "append_sql" }, + "flowDetails": {"$ref": "#/flow/appendSqlFlowDetails"} + } + }, + { + "properties": { + "flowType": { "const": "append_view" }, + "flowDetails": {"$ref": "#/flow/appendViewFlowDetails"} + } + }, + { + "properties": { + "flowType": { "const": "merge" }, + "flowDetails": {"$ref": "#/flow/mergeFlowDetails"} + } + } + ] + }, + "flow": { + "appendSqlFlowDetails": { + "oneOf": [ + { + "properties": { + "targetTable": {"type": "string"}, + "sqlPath": {"type": "string"}, + "once": {"type": "boolean"} + }, + "additionalProperties": false, + "required": ["targetTable", "sqlPath"] + }, + { + "properties": { + "targetTable": {"type": "string"}, + "sqlStatement": {"type": "string"}, + "once": {"type": "boolean"} + }, + "additionalProperties": false, + "required": ["targetTable", "sqlStatement"] + } + ] + }, + "appendViewFlowDetails": { + "properties": { + "targetTable": {"type": "string"}, + "sourceView": {"type": "string"}, + "column_prefix": {"type": "string"}, + 
"column_prefix_exceptions": {"type": "array"}, + "once": {"type": "boolean"} + }, + "additionalProperties": false, + "required": ["targetTable", "sourceView"] + }, + "mergeFlowDetails": { + "properties": { + "targetTable": {"type": "string"}, + "sourceView": {"type": "string"} + }, + "additionalProperties": false, + "required": ["targetTable", "sourceView"] + } + } +} \ No newline at end of file diff --git a/src/schemas/definitions_main.json b/src/schemas/definitions_main.json new file mode 100644 index 0000000..9f89200 --- /dev/null +++ b/src/schemas/definitions_main.json @@ -0,0 +1,176 @@ +{ + "title": "Standard DataflowSpec Definitions", + "definitions": { + "cdcSettings": { + "type": "object", + "properties": { + "keys": {"type": "array", "items": {"type": "string"}}, + "sequence_by": {"type": "string"}, + "apply_as_deletes": {"type": "string"}, + "where": {"type": "string"}, + "ignore_null_updates": {"type": "boolean"}, + "except_column_list": {"type": "array", "items": {"type": "string"}}, + "scd_type": {"type": "string"}, + "track_history_column_list": {"type": "array", "items": {"type": "string"}}, + "track_history_except_column_list": {"type": "array", "items": {"type": "string"}} + }, + "additionalProperties": false, + "required": ["keys", "sequence_by", "scd_type"] + }, + "cdcSnapshotSettings": { + "type": "object", + "properties": { + "keys": {"type": "array", "items": {"type": "string"}}, + "scd_type": {"type": "string"}, + "snapshotType": {"type": "string", "enum": ["historical", "periodic"]}, + "sourceType": {"type": "string", "enum": ["file", "table"]}, + "source": {}, + "track_history_column_list": {"type": "array", "items": {"type": "string"}}, + "track_history_except_column_list": {"type": "array", "items": {"type": "string"}} + }, + "required": ["keys", "scd_type", "snapshotType"], + "additionalProperties": false, + "allOf": [ + { + "if": { + "allOf": [ + {"properties": {"snapshotType": {"const": "historical"}}}, + {"properties": {"sourceType": 
{"const": "file"}}} + ] + }, + "then": { + "properties": { + "source": {"$ref": "#/definitions/historicalSnapshotFileSource"} + }, + "required": ["sourceType", "source"] + } + }, + { + "if": { + "allOf": [ + {"properties": {"snapshotType": {"const": "historical"}}}, + {"properties": {"sourceType": {"const": "table"}}} + ] + }, + "then": { + "properties": { + "source": {"$ref": "#/definitions/historicalSnapshotTableSource"} + }, + "required": ["sourceType", "source"] + } + }, + { + "if": { + "properties": {"snapshotType": {"const": "periodic"}} + }, + "then": { + "not": { + "anyOf": [ + {"required": ["sourceType"]}, + {"required": ["source"]} + ] + } + } + } + ] + }, + "historicalSnapshotFileSource": { + "type": "object", + "properties": { + "format": {"type": "string"}, + "path": {"type": "string"}, + "readerOptions": {"type": "object"}, + "filter": {"type": "string"}, + "versionType": {"type": "string", "enum": ["integer", "timestamp"]}, + "startingVersion": {"type": ["string", "integer"]}, + "datetimeFormat": {"type": "string"}, + "microSecondMaskLength": {"type": "integer", "minimum": 0, "maximum": 6}, + "schemaPath": {"type": "string"}, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "recursiveFileLookup": {"type": "boolean", "default": false} + }, + "required": ["format", "path", "versionType"], + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "versionType": {"const": "integer"} + } + }, + "then": { + "properties": { + "startingVersion": {"type": "integer"} + }, + "not": { + "required": ["datetimeFormat"] + } + } + }, + { + "if": { + "properties": { + "versionType": {"const": "timestamp"} + } + }, + "then": { + "properties": { + "startingVersion": {"type": "string"} + }, + "required": ["datetimeFormat"] + } + } + ] + }, + "historicalSnapshotTableSource": { + "type": "object", + "properties": { + "table": {"type": "string"}, + "versionColumn": {"type": "string"}, + "versionType": {"type": "string", "enum": 
["date", "timestamp", "integer", "long"]}, + "startingVersion": {"type": ["string", "integer"]}, + "selectExp": {"type": "array", "items": {"type": "string"}} + }, + "required": ["table", "versionColumn", "versionType"], + "additionalProperties": false + }, + "tableMigrationDetails": { + "type": "object", + "properties": { + "enabled": {"type": "boolean"}, + "autoStartingVersionsEnabled": {"type": "boolean", "default": true}, + "catalogType": {"type": "string", "enum": ["hms", "uc"]}, + "sourceDetails": {"$ref": "./definitions_sources.json#/source/sourceMigrateDelta"} + }, + "required": ["enabled", "catalogType", "sourceDetails"], + "additionalProperties": false + }, + "quarantineTargetDetails": { + "type": "object", + "properties": { + "targetFormat": {"type": "string", "enum": ["delta"], "default": "delta"} + }, + "required": ["targetFormat"], + "dependentSchemas": { + "targetFormat": { + "oneOf": [ + { + "properties": { + "targetFormat": { "const": "delta" }, + "table": {"type": "string"}, + "database": {"type": "string"}, + "tableProperties": {"type":"object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"} + + }, + "additionalProperties": false + } + ] + } + } + } + } +} diff --git a/src/schemas/definitions_sources.json b/src/schemas/definitions_sources.json new file mode 100644 index 0000000..8e46d66 --- /dev/null +++ b/src/schemas/definitions_sources.json @@ -0,0 +1,363 @@ +{ + "title": "Source DataflowSpec Definitions", + "sourceType": { + "oneOf": [ + { + "properties": { + "sourceType": { "const": "batchFiles" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceBatchFiles" } + }, + "required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "cloudFiles" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceCloudFiles" } + }, + 
"required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "delta" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceDelta" } + }, + "required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "deltaJoin" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceDeltaJoin" } + }, + "required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "kafka" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceKafkaReader" } + }, + "required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "python" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourcePython" } + }, + "required": ["sourceDetails"] + }, + { + "properties": { + "sourceType": { "const": "sql" }, + "sourceDetails": { "$ref": "./definitions_sources.json#/source/sourceSql" } + }, + "required": ["sourceDetails"] + } + ] + }, + "source": { + "sourceBatchFiles": { + "properties": { + "format": {"type": "string", "enum": ["csv", "json", "parquet", "text", "xml"]}, + "path": {"type": "string"}, + "readerOptions": { + "type": "object", + "properties": { + "header": {"type": "string", "enum": ["true", "false"], "default": "true"}, + "inferSchema": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "mode": {"type": "string", "enum": ["PERMISSIVE", "DROPMALFORMED", "FAILFAST"], "default": "PERMISSIVE"}, + "multiLine": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "rescuedDataColumn": {"type": "string", "default": "_rescued_data"} + }, + "additionalProperties": true + }, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}}, + "schemaPath": {"type": "string", "pattern": "\\.json$"}, + "pythonTransform": {"$ref": "./definitions_sources.json#/$defs/pythonTransform"} + }, + "required": ["path", "readerOptions"], + "additionalProperties": 
false + }, + "sourceCloudFiles": { + "properties": { + "path": {"type": "string"}, + "readerOptions": { + "type": "object", + "properties": { + "cloudFiles.format": {"type": "string","enum": ["avro", "binaryFile", "csv", "json", "orc", "parquet", "text", "xml"]}, + "cloudFiles.allowOverwrites": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "cloudFiles.backfillInterval": {"type": "string"}, + "cloudFiles.includeExistingFiles": {"type": "string", "enum": ["true", "false"], "default": "true"}, + "cloudFiles.inferColumnTypes": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "cloudFiles.maxBytesPerTrigger": {"type": "string"}, + "cloudFiles.maxFileAge": {"type": "string"}, + "cloudFiles.maxFilesPerTrigger": {"type": "string", "default": "1000"}, + "cloudFiles.partitionColumns": {"type": "string"}, + "cloudFiles.schemaEvolutionMode": {"type": "string", "enum": ["addNewColumns", "rescue", "failOnNewColumns"]}, + "cloudFiles.schemaHints": {"type": "string"}, + "cloudFiles.schemaLocation": {"type": "string"}, + "cloudFiles.useStrictGlobber": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "cloudFiles.validateOptions": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "ignoreCorruptFiles": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "ignoreMissingFiles": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "modifiedAfter": {"type": "string", "pattern": "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6} UTC\\+\\d{1,2}$"}, + "modifiedBefore": {"type": "string", "pattern": "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{6} UTC\\+\\d{1,2}$"}, + "pathGlobFilter": {"type": "string"}, + "recursiveFileLookup": {"type": "string", "enum": ["true", "false"], "default": "false"} + }, + "allOf": [ + { + "if": { + "properties": { + "cloudFiles.format": { "const": "json" } + } + }, + "then": { + "properties": { + "mode": {"type": "string", "enum": ["PERMISSIVE", 
"DROPMALFORMED", "FAILFAST"], "default": "PERMISSIVE"}, + "multiLine": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "rescuedDataColumn": {"type": "string", "default": "_rescued_data"} + } + } + }, + { + "if": { + "properties": { + "cloudFiles.format": { "const": "csv" } + } + }, + "then": { + "properties": { + "columnNameOfCorruptRecord": {"type": "string", "default": "_corrupt_record"}, + "header": {"type": "string", "enum": ["true", "false"], "default": "true"}, + "inferSchema": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "mode": {"type": "string", "enum": ["PERMISSIVE", "DROPMALFORMED", "FAILFAST"], "default": "PERMISSIVE"}, + "multiLine": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "rescuedDataColumn": {"type": "string", "default": "_rescued_data"}, + "quote": {"type": "string", "default": "\""}, + "sep": {"type": "string", "default": ","} + } + } + }, + { + "if": { + "properties": { + "cloudFiles.format": { "const": "text" } + } + }, + "then": { + "properties": { + "encoding": {"type": "string", "default": "UTF-8"}, + "lineSep": {"type": "string"}, + "wholeText": {"type": "string", "enum": ["true", "false"], "default": "false"} + } + } + }, + { + "if": { + "properties": { + "cloudFiles.format": { "const": "xml" } + } + }, + "then": { + "properties": { + "attributePrefix": {"type": "string"}, + "arrayElementName": {"type": "string"}, + "columnNameOfCorruptRecord": {"type": "string", "default": "_corrupt_record"}, + "ignoreNamespace": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "inferSchema": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "mode": {"type": "string", "enum": ["PERMISSIVE", "DROPMALFORMED", "FAILFAST"], "default": "PERMISSIVE"}, + "multiLine": {"type": "string", "enum": ["true", "false"], "default": "false"}, + "rescuedDataColumn": {"type": "string", "default": "_rescued_data"}, + "rootTag": {"type": "string"}, + "rowTag": 
{"type": "string"}, + "rowValidationXSDPath": {"type": "string"}, + "valueTag": {"type": "string"} + } + } + } + ], + "additionalProperties": true, + "required": ["cloudFiles.format"] + }, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}}, + "schemaPath": {"type": "string", "pattern": "\\.json$"}, + "pythonTransform": {"$ref": "./definitions_sources.json#/$defs/pythonTransform"} + }, + "required": ["path", "readerOptions"], + "additionalProperties": false + }, + "sourceDelta": { + "properties": { + "database": {"type": "string"}, + "table": {"type": "string"}, + "cdfEnabled": {"type": "boolean"}, + "tablePath": {"type": "string"}, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}}, + "schemaPath": {"type": "string", "pattern": "\\.json$"}, + "readerOptions": {"type": "object", "additionalProperties": true}, + "cdfChangeTypeOverride": {"type": "array", "items": {"type": "string", "enum": ["insert", "update_postimage", "delete"]}}, + "startingVersionFromDLTSetup": {"type": "boolean"}, + "pythonTransform": {"$ref": "./definitions_sources.json#/$defs/pythonTransform"} + }, + "required": ["database", "table", "cdfEnabled"], + "allOf": [ + { + "if": { + "properties": { "startingVersionFromDLTSetup": { "const": true } }, + "required": ["startingVersionFromDLTSetup"] + }, + "then": { + "properties": { "cdfEnabled": { "const": true } } + } + } + ], + "additionalProperties": false + }, + "sourceDeltaJoin": { + "properties": { + "sources": { + "type": "array", + "items": { + "properties": { + "database": {"type": "string"}, + "table": {"type": "string"}, + "alias": {"type": "string"}, + "joinMode": {"type": "string", "enum": ["stream", "static"], "default": "stream"}, + "cdfEnabled": {"type": "boolean"}, + "tablePath": {"type": "string"}, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": 
"array", "items": {"type": "string"}}, + "schemaPath": {"type": "string", "pattern": "\\.json$"}, + "readerOptions": {"type": "object", "additionalProperties": true}, + "pythonTransform": {"$ref": "./definitions_sources.json#/$defs/pythonTransform"} + }, + "required": ["database", "table", "alias", "cdfEnabled", "joinMode"], + "additionalProperties": false + } + }, + "joins": { + "type": "array", + "items": { + "type": "object", + "properties": { + "joinType": {"type": "string", "enum": ["left", "inner"], "default": "left"}, + "condition": {"type": "string"} + }, + "required": ["joinType", "condition"], + "additionalProperties": false + } + }, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}} + }, + "required": ["sources", "joins"], + "additionalProperties": false + }, + "sourceKafkaReader": { + "properties": { + "readerOptions": { + "type": "object", + "properties": { + "endingOffsets": {"type": "string"}, + "failOnDataLoss": {"type": "string"}, + "kafka.bootstrap.servers": {"type": "string"}, + "kafka.group.id": {"type": "string"}, + "kafka.security.protocol": {"type": "string", "default": "SASL_SSL"}, + "kafka.sasl.mechanism": {"type": "string", "default": "PLAIN"}, + "kafka.ssl.truststore.location": {"type": "string"}, + "kafka.ssl.truststore.password": {"type": "string"}, + "kafka.ssl.keystore.location": {"type": "string"}, + "kafka.ssl.keystore.password": {"type": "string"}, + "minPartitions": {"type": "string"}, + "startingOffsets": {"type": "string"}, + "subscribe": {"type": "string", "default": "latest"} + }, + "additionalProperties": true + }, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}}, + "schemaPath": {"type": "string", "pattern": "\\.json$"}, + "pythonTransform": {"$ref": "./definitions_sources.json#/$defs/pythonTransform"} + }, + "required": ["readerOptions"], + "additionalProperties": false + 
}, + "sourceMigrateDelta": { + "properties": { + "database": {"type": "string"}, + "table": {"type": "string"}, + "selectExp": {"type": "array", "items": {"type": "string"}}, + "whereClause": {"type": "array", "items": {"type": "string"}}, + "exceptColumns": {"type": "array", "items": {"type": "string"}} + }, + "required": ["database", "table"], + "additionalProperties": false + }, + "sourcePython": { + "properties": { + "tokens": {"type": "object"}, + "functionPath": {"type": "string", "pattern": "\\.py$"}, + "pythonModule": {"type": "string", "pattern": "^[a-zA-Z_][a-zA-Z0-9_.]*\\.[a-zA-Z_][a-zA-Z0-9_]*$"} + }, + "oneOf": [ + {"required": ["functionPath"]}, + {"required": ["pythonModule"]} + ], + "additionalProperties": false + }, + "sourceSql": { + "oneOf": [ + { + "properties": { + "sqlPath": {"type": "string"} + }, + "required": ["sqlPath"], + "additionalProperties": false + }, + { + "properties": { + "sqlStatement": {"type": "string"} + }, + "required": ["sqlStatement"], + "additionalProperties": false + } + ] + }, + "kafkaConfig": { + "type": "object", + "properties": { + "kafkaSchemaPath": {"type": "string"}, + "kafkaSubject": {"type": "string"}, + "schemaRegistryAddress": {"type": "string"}, + "schemaRegistryOptions": { + "type": "object", + "properties": { + "avroSchemaEvolutionMode": {"type": "string","enum": ["restart","none"], "default": "none"}, + "mode": {"type": "string","enum": ["FAILFAST","PERMISSIVE" ], "default": "FAILFAST"}, + "certPath" : {"type": "string"}, + "keyPath" : {"type": "string"} + }, + "additionalProperties": true + } + }, + "additionalProperties": false + } + }, + "$defs": { + "pythonTransform": { + "type": "object", + "properties": { + "functionPath": {"type": "string", "pattern": "\\.py$"}, + "module": {"type": "string", "pattern": "^[a-zA-Z_][a-zA-Z0-9_.]*\\.[a-zA-Z_][a-zA-Z0-9_]*$"}, + "tokens": {"type": "object", "additionalProperties": true} + }, + "oneOf": [ + {"required": ["functionPath"]}, + {"required": ["module"]} + ], 
+ "additionalProperties": false + } + } +} \ No newline at end of file diff --git a/src/schemas/definitions_targets.json b/src/schemas/definitions_targets.json new file mode 100644 index 0000000..2412453 --- /dev/null +++ b/src/schemas/definitions_targets.json @@ -0,0 +1,219 @@ +{ + "title": "Target DataflowSpec Definitions", + "targetFormat": { + "oneOf": [ + { + "properties": { + "targetFormat": { "const": "delta" }, + "targetDetails": { "$ref": "./definitions_targets.json#/target/targetDelta" } + } + }, + { + "properties": { + "targetFormat": { "const": "delta_sink" }, + "targetDetails": { "$ref": "./definitions_targets.json#/target/targetDeltaSink" } + } + }, + { + "properties": { + "targetFormat": { "const": "custom_python_sink" }, + "targetDetails": { "$ref": "./definitions_targets.json#/target/targetCustomPythonSink" } + } + }, + { + "properties": { + "targetFormat": { "const": "kafka_sink" }, + "targetDetails": { "$ref": "./definitions_targets.json#/target/targetKafkaSink" } + } + }, + { + "properties": { + "targetFormat": { "const": "foreach_batch_sink" }, + "targetDetails": { "$ref": "./definitions_targets.json#/target/targetForEachBatchSink" } + } + } + ] + }, + "target": { + "targetDelta": { + "type": "object", + "properties": { + "database": {"type": "string"}, + "table": {"type": "string"}, + "type": {"type": "string", "enum": ["ST", "MV"]}, + "schemaPath": {"type": "string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"}, + "comment": {"type": "string"}, + "sparkConf": {"type": "object"}, + "rowFilter": {"type": "string"} + }, + "dependentSchemas": { + "type": { + "oneOf": [ + { + "properties": { + "type": { "const": "ST" } + }, + 
"additionalProperties": false + }, + { + "allOf": [ + {"properties": {"type": {"const": "MV"}}}, + { + "anyOf": [ + {"required": ["sourceView"]}, + {"required": ["sqlPath"]}, + {"required": ["sqlStatement"]} + ] + } + ], + "additionalProperties": false + } + ] + } + }, + "required": ["table"], + "oneOf": [ + { + "not": { + "required": ["clusterByColumns", "partitionColumns"] + } + } + ] + }, + "targetCustomPythonSink": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sinkOptions": { + "type": "object", + "additionalProperties": true + }, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "required": ["name", "sinkOptions"], + "additionalProperties": false + }, + "targetDeltaSink": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sinkOptions": { + "type": "object", + "properties": { + "tableName": {"type": "string"}, + "path": {"type": "string"} + }, + "additionalProperties": true, + "oneOf": [ + { + "required": ["tableName"], + "not": { + "required": ["path"] + } + }, + { + "required": ["path"], + "not": { + "required": ["tableName"] + } + } + ] + }, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "required": ["name", "sinkOptions"], + "additionalProperties": false + }, + "targetKafkaSink": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "sinkOptions": { + "type": "object", + "properties": { + "topic": {"type": "string"}, + "kafka.bootstrap.servers": {"type": "string"}, + "kafka.group.id": {"type": "string"}, + "kafka.security.protocol": {"type": "string", "default": "SASL_SSL"}, + "kafka.ssl.keystore.location": {"type": "string"}, + "kafka.ssl.keystore.password": {"type": "string"}, + "kafka.ssl.truststore.location": {"type": "string"}, + "kafka.ssl.truststore.password": {"type": "string"} + }, + "required": ["topic", "kafka.bootstrap.servers"] + }, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + 
"required": ["name", "sinkOptions"], + "additionalProperties": false + }, + "targetForEachBatchSink": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "type": {"type": "string", "enum": ["basic_sql", "python_function"]}, + "config": {"type": "object"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "additionalProperties": false, + "required": ["name", "type", "config"], + "dependentSchemas": { + "type": { + "oneOf": [ + { + "properties": { + "type": { "const": "basic_sql" }, + "target": { + "type": "object", + "properties": { + "database": {"type": "string"}, + "table": {"type": "string"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "sqlPath": {"type": "string"}, + "sqlStatement": {"type": "string"} + } + } + } + }, + { + "properties": { + "type": { "const": "python_function" }, + "target": { + "type": "object", + "properties": { + "tokens": {"type": "object"}, + "module": {"type": "string", "pattern": "^[a-zA-Z_][a-zA-Z0-9_.]*\\.[a-zA-Z_][a-zA-Z0-9_]*$"}, + "functionPath": {"type": "string", "pattern": "\\.py$"} + }, + "oneOf": [ + {"required": ["functionPath"]}, + {"required": ["module"]} + ] + } + } + } + ] + } + } + } + }, + "$defs": { + "configFlags": { + "type": "array", + "items": { + "type": "string", + "enum": ["disableOperationalMetadata"] + }, + "default": [] + } + } +} diff --git a/src/schemas/expectations.json b/src/schemas/expectations.json new file mode 100644 index 0000000..8aaaeba --- /dev/null +++ b/src/schemas/expectations.json @@ -0,0 +1,24 @@ +{ + "title": "Expectations", + "type": "object", + "properties": { + "expect": {"$ref": "#/definitions/expectations"}, + "expect_or_drop": {"$ref": "#/definitions/expectations"}, + "expect_or_fail": {"$ref": "#/definitions/expectations"} + }, + "definitions": { + "expectations": { + 
"type": "array", + "items": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "constraint": {"type": "string"}, + "tag": {"type": "string"}, + "enabled": {"type": "boolean"} + }, + "required": ["name", "constraint"] + } + } + } +} \ No newline at end of file diff --git a/src/schemas/flow_group.json b/src/schemas/flow_group.json new file mode 100644 index 0000000..738c09d --- /dev/null +++ b/src/schemas/flow_group.json @@ -0,0 +1,140 @@ +{ + "title": "Flow DataFlowSpec FlowGroup", + "type": "object", + "properties": { + "dataFlowId": {"type": "string"}, + "flowGroupId": {"type": "string"}, + "stagingTables": {"$ref": "#/definitions/stagingTables"}, + "flows": { + "type": "object", + "patternProperties": { + "f_([A-Za-z0-9_]+)": { + "type": "object", + "properties": { + "enabled": {"type": "boolean"}, + "flowType": {"type": "string", "enum": ["append_view", "append_sql", "merge"]}, + "flowDetails": {"type": "object"}, + "views": {"$ref": "#/definitions/views"} + }, + "required": ["flowType", "flowDetails"], + "dependentSchemas": { + "flowType": { "$ref": "./definitions_flows.json#/flowType"} + } + } + }, + "additionalProperties": false + } + }, + "required": ["flowGroupId", "flows"], + "additionalProperties": false, + "definitions": { + "stagingTables": { + "additionalProperties": { + "type": "object", + "properties": { + "type": {"type": "string", "enum": ["ST", "MV"]}, + "database": {"type": "string"} + }, + "dependentSchemas": { + "type": { + "oneOf": [ + { + "properties": { + "type": {"const": "ST"}, + "database": {"type": "string"}, + "schemaPath": {"type": "string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"}, + "cdcSettings": {"$ref": "./definitions_main.json#/definitions/cdcSettings"}, + 
"cdcSnapshotSettings": {"$ref": "./definitions_main.json#/definitions/cdcSnapshotSettings"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "additionalProperties": false + }, + { + "properties": { + "type": {"const": "MV"}, + "database": {"type": "string"}, + "schemaPath": {"type": "string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"}, + "sourceView": {"type": "string"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "additionalProperties": false, + "required": ["sourceView"] + }, + { + "properties": { + "type": {"const": "MV"}, + "database": {"type": "string"}, + "schemaPath": {"type": "string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"}, + "sqlPath": {"type": "string"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "additionalProperties": false, + "required": ["sqlPath"] + }, + { + "properties": { + "type": {"const": "MV"}, + "database": {"type": "string"}, + "schemaPath": {"type": "string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByAuto": {"type": "boolean"}, + "sqlStatement": {"type": "string"}, + "configFlags": {"$ref": "./definitions_targets.json#/$defs/configFlags"} + }, + "additionalProperties": false, + "required": ["sqlStatement"] + } + ] + } + }, + "required": ["type"] + } + }, + 
"views": { + "type": "object", + "patternProperties": { + "v_([A-Za-z0-9_]+)": { + "type": "object", + "properties": { + "mode": {"type": "string", "enum": ["batch", "stream"]}, + "sourceType": {"type": "string", "enum": ["batchFiles", "cloudFiles", "delta", "deltaJoin", "kafka", "python", "sql"]}, + "sourceDetails": {"type": "object"} + }, + "dependentSchemas": { + "sourceType": { "$ref": "./definitions_sources.json#/sourceType"} + }, + "required": ["mode", "sourceType", "sourceDetails"] + } + }, + "additionalProperties": false + } + }, + "$defs": { + "configFlags": { + "type": "array", + "items": { + "type": "string", + "enum": ["disableOperationalMetadata"] + }, + "default": [] + } + } +} \ No newline at end of file diff --git a/src/schemas/main.json b/src/schemas/main.json new file mode 100644 index 0000000..1d34ce6 --- /dev/null +++ b/src/schemas/main.json @@ -0,0 +1,40 @@ +{ + "title": "Main DataFlowSpec", + "type": "object", + "properties": { + "dataFlowId": {"type": "string"}, + "dataFlowGroup": {"type": "string"}, + "dataFlowType": {"type": "string", "enum": ["flow", "standard", "materialized_view"]}, + "dataFlowVersion": {"type": "string"}, + "tags": {"type": "object", "additionalProperties": true}, + "features": { + "type": "object", + "properties": { + "operationalMetadataEnabled": {"type": "boolean"} + } + } + }, + "if": { + "properties": { + "dataFlowType": { "const": "standard" } + } + }, + "then": { "$ref": "./spec_standard.json#/$defs/standardSpec" }, + "else": { + "if": { + "properties": { + "dataFlowType": { "const": "flow" } + } + }, + "then": { "$ref": "./spec_flows.json#/$defs/flowsSpec" }, + "else": { + "if": { + "properties": { + "dataFlowType": { "const": "materialized_view" } + } + }, + "then": { "$ref": "./spec_materialized_views.json#/$defs/materializedViewsSpec" } + } + }, + "required": ["dataFlowId", "dataFlowGroup", "dataFlowType"] +} \ No newline at end of file diff --git a/src/schemas/secrets.json b/src/schemas/secrets.json new file 
mode 100644 index 0000000..1431656 --- /dev/null +++ b/src/schemas/secrets.json @@ -0,0 +1,17 @@ +{ + "title": "Secrets", + "type": "object", + "patternProperties": { + "([A-Za-z0-9_]+)": { + "type": "object", + "properties": { + "scope": {"type": "string"}, + "key": {"type": "string"}, + "exceptionEnabled": {"type": "boolean"} + }, + "required": ["scope", "key"], + "additionalProperties": false + } + }, + "additionalProperties": false +} \ No newline at end of file diff --git a/src/schemas/spec_flows.json b/src/schemas/spec_flows.json new file mode 100644 index 0000000..d7222be --- /dev/null +++ b/src/schemas/spec_flows.json @@ -0,0 +1,58 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://dlt-framework/schemas/spec_flows.json", + "title": "Flows Dataflow Specification", + "$defs": { + "flowsSpec": { + "type": "object", + "properties": { + "dataFlowId": {"type": "string"}, + "dataFlowGroup": {"type": "string"}, + "dataFlowType": {"type": "string", "enum": ["flow", "standard", "materialized_views"]}, + "dataFlowVersion": {"type": "string"}, + "tags": {"type": "object", "additionalProperties": true}, + "features": {"type": "object", "additionalProperties": true}, + "targetFormat": {"type": "string", "enum": ["delta", "delta_sink", "kafka_sink", "foreach_batch_sink"]}, + "targetDetails": {"type": "object"}, + "cdcSettings": {"$ref": "./definitions_main.json#/definitions/cdcSettings"}, + "cdcSnapshotSettings": {"$ref": "./definitions_main.json#/definitions/cdcSnapshotSettings"}, + "dataQualityExpectationsEnabled": {"type": "boolean", "default": false}, + "dataQualityExpectationsPath": {"type": "string"}, + "quarantineMode": {"type": "string", "enum": ["off", "flag", "table"], "default": "off"}, + "quarantineTargetDetails": {"$ref": "./definitions_main.json#/definitions/quarantineTargetDetails"}, + "tableMigrationDetails": {"$ref": "./definitions_main.json#/definitions/tableMigrationDetails"}, + "flowGroups": {"type": "array", "items": 
{"$ref": "./flow_group.json"}} + }, + "required": ["dataFlowId", "dataFlowGroup", "dataFlowType", "targetFormat", "targetDetails"], + "additionalProperties": false, + "dependentSchemas": { + "targetFormat": { "$ref": "./definitions_targets.json#/targetFormat"} + }, + "not": {"required": ["cdcSettings", "cdcSnapshotSettings"]}, + "allOf": [ + { + "if": { + "properties": { + "dataQualityExpectationsEnabled": { "const": true } + }, + "required": ["dataQualityExpectationsEnabled"] + }, + "then": { + "required": ["dataQualityExpectationsPath"] + } + }, + { + "if": { + "properties": { + "quarantineMode": { "const": "table" } + }, + "required": ["quarantineMode"] + }, + "then": { + "required": ["quarantineTargetDetails"] + } + } + ] + } + } +} \ No newline at end of file diff --git a/src/schemas/spec_mapping.json b/src/schemas/spec_mapping.json new file mode 100644 index 0000000..f95895e --- /dev/null +++ b/src/schemas/spec_mapping.json @@ -0,0 +1,63 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Dataflow Spec Mapping", + "type": "object", + "additionalProperties": {"$ref": "#/$defs/mappingOperations"}, + "$defs": { + "mappingOperations": { + "type": "object", + "properties": { + "rename_all": {"type": "object", "additionalProperties": {"$ref": "#/$defs/operationValue"}}, + "rename_specific": {"type": "object", "additionalProperties": {"$ref": "#/$defs/operationValue"}}, + "move": {"type": "object", "additionalProperties": {"$ref": "#/$defs/operationValue"}}, + "delete": {"type": "object", "additionalProperties": {"$ref": "#/$defs/deleteValue"}} + }, + "additionalProperties": false + }, + "operationValue": { + "oneOf": [ + {"type": "string", "minLength": 1}, + {"$ref": "#/$defs/conditionalOperation"} + ] + }, + "conditionalOperation": { + "type": "object", + "properties": { + "to": {"type": "string", "minLength": 1}, + "condition": {"$ref": "#/$defs/condition"} + }, + "required": ["to", "condition"], + "additionalProperties": false + }, + 
"condition": { + "type": "object", + "properties": { + "key": {"type": "string", "minLength": 1}, + "operator": {"type": "string", "enum": ["equal_to", "not_equal_to", "in", "not_in"]}, + "value": { + "oneOf": [ + {"type": "string"}, + {"type": "number"}, + {"type": "boolean"}, + {"type": "null"}, + {"type": "array", "items": {"oneOf": [{"type": "string"}, {"type": "number"}, {"type": "boolean"}, {"type": "null"}]}} + ] + } + }, + "required": ["key", "operator", "value"], + "additionalProperties": false + }, + "deleteValue": { + "oneOf": [ + {"type": "boolean"}, + {"type": "string"}, + { + "type": "object", + "properties": {"condition": {"$ref": "#/$defs/condition"}}, + "required": ["condition"], + "additionalProperties": false + } + ] + } + } +} diff --git a/src/schemas/spec_materialized_views.json b/src/schemas/spec_materialized_views.json new file mode 100644 index 0000000..c15a81f --- /dev/null +++ b/src/schemas/spec_materialized_views.json @@ -0,0 +1,74 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://dlt-framework/schemas/spec_materialized_views.json", + "title": "Materialized Views Dataflow Specification", + "$defs": { + "materializedViewsSpec": { + "properties": { + "dataFlowId": {"type": "string"}, + "dataFlowGroup": {"type": "string"}, + "dataFlowType": { "const": "materialized_view" }, + "dataFlowVersion": {"type": "string"}, + "tags": {"type": "object", "additionalProperties": true}, + "materializedViewsPath": {"type": "string"}, + "materializedViews": { + "additionalProperties": { + "type": "object", + "properties": { + "sourceView": { + "type": "object", + "properties": { + "sourceViewName": {"type": "string"}, + "sourceType": {"type": "string", "enum": ["delta", "python", "sql"]}, + "sourceDetails": {"type": "object"} + } + }, + "sqlPath": {"type": "string"}, + "sqlStatement": {"type": "string"}, + "tableDetails": { + "type": "object", + "properties": { + "database": {"type": "string"}, + "schemaPath": {"type": 
"string", "pattern": "\\.(json|ddl)$"}, + "tableProperties": {"type": "object"}, + "path": {"type": "string"}, + "partitionColumns": {"type": "array", "items": {"type": "string"}}, + "clusterByColumns": {"type": "array", "items": {"type": "string"}}, + "comment": {"type": "string"}, + "sparkConf": {"type": "object"}, + "private": {"type": "boolean"}, + "rowFilter": {"type": "string"} + }, + "additionalProperties": false + }, + "dataQualityExpectationsEnabled": {"type": "boolean"}, + "dataQualityExpectationsPath": {"type": "string"}, + "quarantineMode": {"type": "string", "enum": ["off", "flag", "table"], "default": "off"}, + "quarantineTargetDetails": {"type": "object"} + }, + "additionalProperties": false, + "oneOf": [ + { + "required": ["sourceView"], + "dependentSchemas": { + "sourceView": { + "properties": { + "sourceType": {"$ref": "./definitions_sources.json#/sourceType"} + } + } + } + }, + { + "required": ["sqlPath"] + }, + { + "required": ["sqlStatement"] + } + ] + } + } + }, + "required": ["dataFlowId", "dataFlowGroup", "dataFlowType", "materializedViews"] + } + } +} \ No newline at end of file diff --git a/src/schemas/spec_standard.json b/src/schemas/spec_standard.json new file mode 100644 index 0000000..b784a67 --- /dev/null +++ b/src/schemas/spec_standard.json @@ -0,0 +1,106 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://dlt-framework/schemas/spec_standard.json", + "title": "Standard Dataflow Specification", + "$defs": { + "standardSpec": { + "properties": { + "dataFlowId": {"type": "string"}, + "dataFlowGroup": {"type": "string"}, + "dataFlowType": {"type": "string", "enum": ["standard"]}, + "dataFlowVersion": {"type": "string"}, + "tags": {"type": "object", "additionalProperties": true}, + "features": {"type": "object", "additionalProperties": true}, + "sourceSystem": {"type": "string"}, + "sourceType": {"type": "string", "enum": ["batchFiles", "cloudFiles", "delta", "deltaJoin", "kafka", "python", "sql"]}, + 
"sourceDetails": {"type": "object"}, + "sourceViewName": {"type": "string","pattern": "v_([A-Za-z0-9_]+)"}, + "mode": {"type": "string", "enum": ["stream","batch"]}, + "targetFormat": {"type": "string", "enum": ["delta", "delta_sink", "kafka_sink", "foreach_batch_sink"]}, + "targetDetails": {"type": "object"}, + "cdcSettings": {"$ref": "./definitions_main.json#/definitions/cdcSettings"}, + "cdcSnapshotSettings": {"$ref": "./definitions_main.json#/definitions/cdcSnapshotSettings"}, + "dataQualityExpectationsEnabled": {"type": "boolean", "default": false}, + "dataQualityExpectationsPath": {"type": "string"}, + "quarantineMode": {"type": "string", "enum": ["off", "flag", "table"], "default": "off"}, + "quarantineTargetDetails": {"$ref": "./definitions_main.json#/definitions/quarantineTargetDetails"}, + "tableMigrationDetails": {"$ref": "./definitions_main.json#/definitions/tableMigrationDetails"} + }, + "required": ["dataFlowId", "dataFlowGroup", "dataFlowType", "targetFormat", "targetDetails"], + "additionalProperties": false, + "dependentSchemas": { + "sourceType": { "$ref": "./definitions_sources.json#/sourceType"}, + "targetFormat": { "$ref": "./definitions_targets.json#/targetFormat"} + }, + "not": {"required": ["cdcSettings", "cdcSnapshotSettings"]}, + "allOf": [ + { + "if": { + "properties": { + "cdcSnapshotSettings": { + "type": "object", + "properties": { + "snapshotType": {"const": "historical"} + }, + "required": ["snapshotType"] + } + }, + "required": ["cdcSnapshotSettings"] + }, + "then": { + "not": { + "anyOf": [ + {"required": ["sourceSystem"]}, + {"required": ["sourceType"]}, + {"required": ["sourceViewName"]}, + {"required": ["mode"]}, + {"required": ["sourceDetails"]} + ] + } + } + }, + { + "if": { + "not": { + "properties": { + "cdcSnapshotSettings": { + "type": "object", + "properties": { + "snapshotType": {"const": "historical"} + }, + "required": ["snapshotType"] + } + }, + "required": ["cdcSnapshotSettings"] + } + }, + "then": { + "required": 
["sourceSystem", "sourceType", "sourceViewName", "mode", "sourceDetails"] + } + }, + { + "if": { + "properties": { + "dataQualityExpectationsEnabled": { "const": true } + }, + "required": ["dataQualityExpectationsEnabled"] + }, + "then": { + "required": ["dataQualityExpectationsPath"] + } + }, + { + "if": { + "properties": { + "quarantineMode": { "const": "table" } + }, + "required": ["quarantineMode"] + }, + "then": { + "required": ["quarantineTargetDetails"] + } + } + ] + } + } +} \ No newline at end of file diff --git a/src/schemas/spec_template.json b/src/schemas/spec_template.json new file mode 100644 index 0000000..1cd349e --- /dev/null +++ b/src/schemas/spec_template.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://dlt-framework/schemas/template_spec_definition.json", + "title": "Template Dataflow Specification", + "description": "Schema for instantiating a dataflow template with parameters", + "type": "object", + "properties": { + "template": {"type": "string"}, + "parameterSets": { + "type": "array", + "items": { + "type": "object", + "properties": { + "dataFlowId": {"type": "string"} + }, + "required": ["dataFlowId"], + "additionalProperties": true, + "minItems": 2 + }, + "minItems": 1 + } + }, + "required": ["template", "parameterSets"], + "additionalProperties": false +} \ No newline at end of file diff --git a/src/schemas/spec_template_definition.json b/src/schemas/spec_template_definition.json new file mode 100644 index 0000000..f894022 --- /dev/null +++ b/src/schemas/spec_template_definition.json @@ -0,0 +1,29 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://dlt-framework/schemas/template_spec.json", + "title": "Template Definition", + "description": "Schema for dataflow template definitions that can be instantiated with parameters", + "type": "object", + "properties": { + "name": {"type": "string"}, + "parameters": { + "type": "object", + "patternProperties": { + 
"^[a-zA-Z_][a-zA-Z0-9_]*$": { + "type": "object", + "properties": { + "type": {"type": "string", "enum": ["string", "list", "object", "integer", "boolean"]}, + "required": {"type": "boolean", "default": true}, + "default": {"type": "string"} + }, + "required": ["type"], + "additionalProperties": false + } + }, + "additionalProperties": false + }, + "template": {"type": "object", "minProperties": 1} + }, + "required": ["name", "parameters", "template"], + "additionalProperties": false +} diff --git a/src/secrets_manager.py b/src/secrets_manager.py new file mode 100644 index 0000000..68ed8c6 --- /dev/null +++ b/src/secrets_manager.py @@ -0,0 +1,196 @@ +from dataclasses import dataclass +from typing import Dict, Pattern, Any, List +import re +import os + +from pyspark.dbutils import DBUtils + +import pipeline_config +import utility + + +@dataclass(frozen=True) +class SecretConfig: + """ + Immutable configuration for a secret. + + Attributes: + scope: The secret scope name + key: The secret key name + exceptionEnabled: Whether to raise exceptions on secret retrieval failure + """ + scope: str + key: str + exceptionEnabled: bool = False + + def __post_init__(self): + """Validate the secret configuration.""" + if not self.scope or not isinstance(self.scope, str): + raise ValueError("Secret scope must be a non-empty string") + if not self.key or not isinstance(self.key, str): + raise ValueError("Secret key must be a non-empty string") + + def get_secret(self, dbutils: DBUtils) -> str: + """Get the secret value, retrieving it if not already cached.""" + try: + return SecretValue( + dbutils.secrets.get( + scope=self.scope, + key=self.key + ) + ) + except Exception as e: + if self.exceptionEnabled: + raise RuntimeError( + f"Failed to retrieve secret '{self.key}' from scope '{self.scope}': {str(e)}" + ) from e + return "" + + +class SecretValue: + """ + A wrapper class that lazily retrieves secrets when accessed. 
+ This prevents secrets from being stored in memory and only retrieves them when needed. + """ + def __init__(self, secret: str): + self.__secret = secret + + def __str__(self) -> str: + """Return the secret value when converted to string.""" + return self.__secret + + def __repr__(self) -> str: + """Return a redacted string representation.""" + return "[REDACTED]" + + def __dict__(self) -> Dict[str, str]: + """Prevent conversion to dict from exposing the secret value.""" + return {"secret": "[REDACTED]"} + + def __getstate__(self) -> Dict[str, str]: + """Prevent pickling from exposing the secret value.""" + return {"secret": "[REDACTED]"} + + def clear(self) -> None: + """Clear the cached secret value.""" + self.__secret = None + + +class SecretsManager: + """ + This class provides a centralized way to manage secrets. + + Attributes: + dbutils: DBUtils instance for accessing secrets + logger: Logger instance for logging + framework_secrets_config_path: Path to framework secrets config + pipeline_secrets_config_path: Path to pipeline secrets config + + Example: + manager = SecretsManager(dbutils, "./config/framework_secrets.json", "./config/pipeline_secrets.json") + secrets = manager.get_secrets() # Get all secrets + value = manager.get_secret("db_password") # Get specific secret + """ + + SECRET_PATTERN: Pattern = re.compile(r"^\$\{secret\.([a-zA-Z0-9_]+)\}$") + + def __init__( + self, + json_validation_schema_path: str, + framework_secrets_config_paths: List[str], + pipeline_secrets_config_paths: List[str] + ): + self.dbutils = pipeline_config.get_dbutils() + self.logger = pipeline_config.get_logger() + self.json_validation_schema_path = json_validation_schema_path + self.framework_secrets_config_paths = framework_secrets_config_paths + self.pipeline_secrets_config_paths = pipeline_secrets_config_paths + self.validator = utility.JSONValidator(json_validation_schema_path) + + # Load secret configurations only + self._secret_configs = self._load_secrets() + + def 
_load_file(self, paths: List[str], config_type: str) -> Dict[str, Any]: + """Load a single secret file from a list of possible paths.""" + existing_files = [path for path in paths if os.path.exists(path)] + + if len(existing_files) > 1: + raise ValueError(f"Multiple {config_type} secrets files found. Only one is allowed: {existing_files}") + + if len(existing_files) == 1: + file_path = existing_files[0] + + self.logger.info("Retrieving %s secrets from: %s", config_type, file_path) + return utility.load_config_file_auto(file_path, False) + + self.logger.warning("No %s secrets file found.", config_type) + return {} + + def _load_secrets_config_from_files(self) -> Dict[str, Any]: + """Load and merge framework and pipeline secret configurations.""" + framework_secrets = self._load_file(self.framework_secrets_config_paths, "framework") + pipeline_secrets = self._load_file(self.pipeline_secrets_config_paths, "pipeline") + + return utility.merge_dicts_recursively(pipeline_secrets, framework_secrets) + + def _load_secrets(self) -> Dict[str, SecretConfig]: + """Load and merge framework and pipeline secret configurations.""" + json_data = self._load_secrets_config_from_files() + errors = self.validator.validate(json_data) + if errors: + raise ValueError(f"Secrets validation errors: {errors}") + + try: + return { + alias: SecretConfig(**config) + for alias, config + in json_data.items() + } + except KeyError as e: + raise ValueError(f"Invalid secret configuration: {e}") from e + except TypeError as e: + raise ValueError(f"Invalid secret configuration: {e}") from e + except Exception as e: + raise e + + def get_secret(self, alias: str) -> SecretValue: + """ + Get a SecretValue object for a secret by its alias. 
+ + Args: + alias: The alias name of the secret + + Returns: + A SecretValue object that will lazily retrieve the secret when accessed + """ + if alias not in self._secret_configs: + raise KeyError(f"Secret alias '{alias}' not found") + + return self._secret_configs[alias].get_secret(self.dbutils) + + def substitute_secrets(self, data: Any) -> Any: + """ + Substitute secret references in a dictionary with SecretValue objects. + + Args: + data: The data to process (dict, list, or any other type) + + Returns: + Processed data with secret references replaced by SecretValue objects + """ + def substitute_value(value: Any) -> Any: + match = self.SECRET_PATTERN.match(value) + if match: + secret_alias = match.group(1) + return self.get_secret(secret_alias) + else: + return value + + if isinstance(data, dict): + return {k: self.substitute_secrets(v) for k, v in data.items()} + elif isinstance(data, list): + return [self.substitute_secrets(item) for item in data] + elif isinstance(data, str): + return substitute_value(data) + else: + return data diff --git a/src/substitution_manager.py b/src/substitution_manager.py new file mode 100644 index 0000000..4afb3e3 --- /dev/null +++ b/src/substitution_manager.py @@ -0,0 +1,186 @@ +import re +import os +from typing import Dict, Any, Optional, Pattern, List +from functools import cached_property + +import utility +import pipeline_config + +class SubstitutionManager(): + """ + Manages token substitutions in strings and nested dictionaries. + + This class handles token replacements in configuration files, supporting both + framework-level and pipeline-level substitutions with optional additional tokens. 
+ + Attributes: + framework_substitutions_path (str): Path to framework substitutions JSON + pipeline_substitutions_path (str): Path to pipeline substitutions JSON + additional_tokens (Optional[Dict[str, str]]): Optional dictionary of additional token replacements + + Methods: + substitute_string(input_string: str) -> str: + Substitute tokens in a string. + + substitute_dict(input_dict: Dict[str, Any]) -> Dict[str, Any]: + Substitute tokens in a nested dictionary. + """ + + # Compiled regex patterns for token substitution + DEFAULT_TOKEN_PATTERN: Pattern = re.compile(r'\{(\w+)\}') + + def __init__( + self, + framework_substitutions_paths: List[str], + pipeline_substitutions_paths: List[str], + additional_tokens: Optional[Dict[str, str]] = None + ): + """Initialize the SubstitutionManager. + + Args: + framework_substitutions_paths: List of paths to framework substitutions files. + pipeline_substitutions_paths: List of paths to pipeline substitutions files. + additional_tokens: Optional dictionary of additional token replacements + """ + if not framework_substitutions_paths or not pipeline_substitutions_paths: + raise ValueError("Framework and pipeline substitution paths must be provided as a list.") + + self.framework_substitutions_paths = framework_substitutions_paths + self.pipeline_substitutions_paths = pipeline_substitutions_paths + self.additional_tokens = additional_tokens or {} + + self.logger = pipeline_config.get_logger() + + self._substitutions_config = self._load_substitution_config() + + def _load_file(self, paths: List[str], config_type: str) -> Dict[str, Any]: + """Load a single substitution file from a list of possible paths.""" + existing_files = [path for path in paths if os.path.exists(path)] + + if len(existing_files) > 1: + raise ValueError(f"Multiple {config_type} substitutions files found. 
Only one is allowed: {existing_files}") + + if len(existing_files) == 1: + file_path = existing_files[0] + self.logger.info("Retrieving %s substitutions from: %s", config_type, file_path) + return utility.load_config_file_auto(file_path, False) or {} + + self.logger.warning("No %s substitutions file found.", config_type) + return {} + + def _load_substitution_config(self) -> Dict[str, Any]: + """Load and merge framework and pipeline substitutions.""" + framework_subs = self._load_file(self.framework_substitutions_paths, "framework") + pipeline_subs = self._load_file(self.pipeline_substitutions_paths, "pipeline") + + return utility.merge_dicts_recursively(pipeline_subs, framework_subs) + + @cached_property + def tokens(self) -> Dict[str, str]: + """Get merged tokens with additional tokens applied.""" + tokens = self._substitutions_config.get('tokens', {}) + if self.additional_tokens: + tokens = utility.merge_dicts_recursively(tokens, self.additional_tokens) + + # Apply substitutions to token values themselves + return { + k: self._substitute_tokens_in_string(v, tokens) + for k, v in tokens.items() + } + + @cached_property + def prefix_suffix_rules(self) -> Dict[str, Dict[str, str]]: + """Get prefix/suffix rules with tokens substituted.""" + rules = self._substitutions_config.get('prefix_suffix', {}) + + # Apply substitutions to prefix/suffix values + return { + k: { + rule_k: self._substitute_tokens_in_string(rule_v, self.tokens) + for rule_k, rule_v in v.items() + } + for k, v in rules.items() + } + + def substitute_string(self, input_string: str, additional_tokens: Optional[Dict[str, Any]] = None) -> str: + """Substitute tokens in a string. + + Args: + input_string: String containing tokens to replace + additional_tokens: Optional dictionary of additional token replacements. + Dictionary values will be converted to strings, with nested dictionaries being JSON serialized. 
+ + Returns: + str: String with all tokens substituted + """ + if not isinstance(input_string, str): + raise TypeError(f"Expected string input, got {type(input_string)}") + + tokens = self.tokens + if additional_tokens: + tokens = utility.merge_dicts_recursively(self.tokens.copy(), additional_tokens) + + return self._substitute_tokens_in_string(input_string, tokens) + + def substitute_dict(self, data: Any) -> Any: + """ + Substitute tokens and apply prefix/suffix rules to a string or dictionary. + + Args: + data: The data to process (dict, list, or string) + + Returns: + Processed data with tokens references replaced by their values + """ + result = self._substitute_tokens(data) + result = self._apply_prefix_suffix(result) + return result + + def _substitute_tokens_in_string( + self, + value: str, + tokens: Dict[str, str], + pattern: Optional[Pattern] = None + ) -> str: + """Replace tokens in a string using regex substitution.""" + if not isinstance(value, str): + return value + + def replace_token(match): + token = match.group(1) + if token in self.additional_tokens: + return self.additional_tokens[token] + elif token in tokens: + return tokens[token] + return match.group(0) + + return (pattern or self.DEFAULT_TOKEN_PATTERN).sub(replace_token, value) + + def _substitute_tokens(self, data: Any) -> Any: + """Substitute tokens in a string or dictionary""" + if isinstance(data, dict): + return {k: self._substitute_tokens(v) for k, v in data.items()} + elif isinstance(data, list): + return [self._substitute_tokens(item) for item in data] + elif isinstance(data, str): + return self._substitute_tokens_in_string(data, self.tokens, self.DEFAULT_TOKEN_PATTERN) + else: + return data + + def _apply_prefix_suffix(self, data: Any) -> Any: + """Apply prefix/suffix rules to a string or dictionary.""" + def apply_value(key: str, value: Any) -> Any: + if isinstance(value, str) and key in self.prefix_suffix_rules: + rules = self.prefix_suffix_rules[key] + if 'prefix' in rules: + 
value = rules['prefix'] + value + if 'suffix' in rules: + value = value + rules['suffix'] + return value + + if isinstance(data, dict): + return {k: apply_value(k, v) for k, v in data.items()} + elif isinstance(data, list): + return [self._apply_prefix_suffix(item) for item in data] + else: + return data \ No newline at end of file diff --git a/src/utility.py b/src/utility.py new file mode 100644 index 0000000..190f02a --- /dev/null +++ b/src/utility.py @@ -0,0 +1,528 @@ +import concurrent.futures +import importlib.util +import inspect +from functools import reduce +import logging +import os +import sys +from typing import Callable, Dict, List + +import json +import jsonschema as js +import yaml + +from pyspark.sql import DataFrame +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType + +from constants import ( + SupportedSpecFormat, + PipelineBundleSuffixesJson, + PipelineBundleSuffixesYaml +) + + +def get_format_suffixes(file_format: str, suffix_type: str) -> list: + """Get file suffixes based on file format and suffix type. 
+ + This is a centralized utility to retrieve the appropriate file suffixes + for different configuration and specification files based on the format + (JSON or YAML) and the type of file (substitutions, secrets, specs, etc.).""" + suffix_map = { + SupportedSpecFormat.JSON.value: { + "substitutions": PipelineBundleSuffixesJson.SUBSTITUTIONS_FILE_SUFFIX, + "secrets": PipelineBundleSuffixesJson.SECRETS_FILE_SUFFIX, + "main_spec": PipelineBundleSuffixesJson.MAIN_SPEC_FILE_SUFFIX, + "flow_group": PipelineBundleSuffixesJson.FLOW_GROUP_FILE_SUFFIX, + "expectations": PipelineBundleSuffixesJson.EXPECTATIONS_FILE_SUFFIX + }, + SupportedSpecFormat.YAML.value: { + "substitutions": PipelineBundleSuffixesYaml.SUBSTITUTIONS_FILE_SUFFIX, + "secrets": PipelineBundleSuffixesYaml.SECRETS_FILE_SUFFIX, + "main_spec": PipelineBundleSuffixesYaml.MAIN_SPEC_FILE_SUFFIX, + "flow_group": PipelineBundleSuffixesYaml.FLOW_GROUP_FILE_SUFFIX, + "expectations": PipelineBundleSuffixesYaml.EXPECTATIONS_FILE_SUFFIX + } + } + + if file_format not in suffix_map: + valid_formats = list(suffix_map.keys()) + raise ValueError( + f"Invalid file format: '{file_format}'. " + f"Valid formats are: {valid_formats}" + ) + + if suffix_type not in suffix_map[file_format]: + valid_types = list(suffix_map[file_format].keys()) + raise ValueError( + f"Invalid suffix type: '{suffix_type}'. " + f"Valid types are: {valid_types}" + ) + + result = suffix_map[file_format][suffix_type] + + # Always return a list for consistency + if isinstance(result, list): + return result + elif isinstance(result, (tuple, set)): + return list(result) + elif isinstance(result, str): + return [result] + else: + raise TypeError( + f"Invalid suffix type for '{suffix_type}': expected str, tuple, list, or set, " + f"but got {type(result).__name__}" + ) + +class JSONValidator: + """ + A JSON schema validator class. + + Attributes: + schema (dict): The JSON schema loaded from a file. + base_uri (str): The base URI for resolving schema references. 
def add_struct_field(struct: StructType, column: Dict) -> StructType:
    """Add a field to a StructType schema.

    The previous implementation appended ``column`` to the temporary dict
    returned by ``struct.jsonValue()`` and returned the result of
    ``list.append`` - i.e. it always returned ``None`` and never changed the
    schema. The field is now really added: ``struct`` is extended in place
    and also returned, so both call styles (relying on the mutation or on
    the return value) work.

    Args:
        struct: The schema to extend.
        column: The field description. It is parsed with Spark's own JSON
            schema parser, so it is expected to use the same layout as the
            entries of ``struct.jsonValue()["fields"]`` (``name``, ``type``,
            ``nullable``, ``metadata``).

    Returns:
        StructType: ``struct`` itself, now containing the new field.
    """
    # Only StructType is imported in this module, so let it parse the single
    # field by wrapping the description in a one-field struct.
    new_field = StructType.fromJson({"type": "struct", "fields": [column]}).fields[0]
    return struct.add(new_field)
Only 'json' and 'yaml' are supported.") + except json.JSONDecodeError as e: + raise ValueError( + f"Error loading JSON file '{file_path}': {e.msg} at line {e.lineno}, column {e.colno}" + ) from e + except yaml.YAMLError as e: + raise yaml.YAMLError(f"Error loading YAML file '{file_path}': {e}") + + +def load_config_files( + path: str, + file_format: str = "json", + file_suffix: str | List[str] = None, + recursive: bool = False +) -> Dict: + """Load configuration data from files with a specific suffix.""" + if file_suffix is None: + file_suffix = f".{file_format}" + + data = {} + if not path or path.strip() == "" or not os.path.exists(path): + raise ValueError(f"Path does not exist: {path}") + + file_suffix_list = [file_suffix] if isinstance(file_suffix, str) else file_suffix + + if recursive: + for root, _, filenames in os.walk(path): + for filename in filenames: + if any(filename.endswith(suffix) for suffix in file_suffix_list): + file_path = os.path.join(root, filename) + data[file_path] = load_config_file(file_path, file_format) + else: + for filename in os.listdir(path): + if any(filename.endswith(suffix) for suffix in file_suffix_list): + file_path = os.path.join(path, filename) + data[file_path] = load_config_file(file_path, file_format) + + return data + + +def load_config_file_auto(file_path: str, fail_on_not_exists: bool = True) -> Dict: + """Load JSON or YAML data from a file with automatic format detection. 
def get_json_from_file(file_path: str, fail_on_not_exists: bool = True) -> Dict:
    """Load a single JSON file (legacy alias for ``load_config_file``).

    Args:
        file_path: Path of the JSON file.
        fail_on_not_exists: Forwarded unchanged to ``load_config_file``.

    Returns:
        Whatever ``load_config_file`` returns for the "json" format.
    """
    return load_config_file(
        file_path,
        file_format="json",
        fail_on_not_exists=fail_on_not_exists,
    )
+def get_yaml_from_files(path: str, file_suffix: str | List[str] = ".yaml", recursive: bool = False) -> Dict: + """Load YAML data from files with a specific suffix.""" + return load_config_files(path, "yaml", file_suffix, recursive) + + +def get_data_from_files_parallel( + path: str, + file_format: str, + file_suffix: str | List[str], + recursive: bool = False, + max_workers: int = 10 +) -> Dict: + """ + Load data from JSON or YAML files that have a specific suffix using parallel processing. + + Args: + path: Directory path to search for files + file_format: File format to load. ["json", "yaml"] + file_suffix: File suffixes to filter by e.g. ".json" or ["_main.json", "_flow.json"] + recursive: Whether to search recursively in subdirectories + max_workers: Maximum number of worker threads for parallel loading + + Returns: + Dict mapping file suffix to a list of file paths to their loaded JSON data + + Example output: + { + ".json": { + "/path/to/file1.json": { + "data": { + "key": "value" + } + }, + "/path/to/file2.json": { + "data": { + "key": "value" + } + } + } + } + + Raises: + ValueError: If the path doesn't exist + """ + def _discover_files(path: str, file_suffix: str | List[str], recursive: bool) -> List[str]: + file_path_list = [] + if recursive: + for root, _, filenames in os.walk(path): + for filename in filenames: + if filename.endswith(file_suffix): + file_path_list.append(os.path.join(root, filename)) + else: + for filename in os.listdir(path): + if filename.endswith(file_suffix): + file_path_list.append(os.path.join(path, filename)) + + return file_path_list + + def load_single_file(file_path): + try: + return file_path, load_config_file(file_path, file_format), None + except Exception as e: + return file_path, None, str(e) + + # Validate path + if not path or path.strip() == "" or not os.path.exists(path): + raise ValueError(f"Path does not exist: {path}") + + file_suffix_list = [file_suffix] if isinstance(file_suffix, str) else file_suffix + 
file_paths = {} + data = {} + errors = {} + for suffix in file_suffix_list: + file_paths[suffix] = _discover_files(path, suffix, recursive) + + for file_suffix, file_paths in file_paths.items(): + print(f"Loading {file_suffix} files...") + print(f"File paths: {file_paths}") + data[file_suffix] = {} + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_path = { + executor.submit(load_single_file, file_path): file_path + for file_path in file_paths + } + + for future in concurrent.futures.as_completed(future_to_path): + file_path, file_data, error = future.result() + if error: + errors[file_path] = error + print(f"Warning: Failed to load {file_path}: {error}") + elif file_data is not None: + data[file_suffix][file_path] = file_data + + if errors: + print(f"Warning: {len(errors)} files failed to load.") + for file_path, error in errors.items(): + print(f"Warning: {file_path}: {error}") + + return data + + +def get_pipeline_update_id(spark: SparkSession) -> str: + """ + Get the pipeline update id from the spark conf. + This is only populated post initialisation of the pipeline, by an event hook in DltPipelineBuilder. + + Args: + spark (SparkSession): The Spark session to use for the pipeline. + + Returns: + str: The pipeline update id. 
def load_python_function(
    python_function_path: str,
    function_name: str,
    required_params: List[str] = None
) -> Callable:
    """Import a Python source file and return one validated function from it.

    Args:
        python_function_path: Path of the Python file to execute as a module.
        function_name: Name of the attribute to fetch from that module.
        required_params: Parameter names the function signature must contain.

    Returns:
        The callable found under ``function_name``.

    Raises:
        ImportError: If no import spec/loader can be built for the path.
        AttributeError: If the module has no attribute ``function_name``.
        TypeError: If that attribute is not callable.
        ValueError: If any of ``required_params`` is missing from the signature.
    """
    expected = [] if required_params is None else required_params

    module_spec = importlib.util.spec_from_file_location("module", python_function_path)
    if not (module_spec and module_spec.loader):
        raise ImportError(f"Could not load Python function from {python_function_path}")

    loaded_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded_module)

    # A unique sentinel tells "attribute missing" apart from any real value.
    absent = object()
    candidate = getattr(loaded_module, function_name, absent)
    if candidate is absent:
        raise AttributeError(
            f"Python function file '{python_function_path}' must contain a "
            f"'{function_name}' function with parameters: {', '.join(expected)}"
        )
    if not callable(candidate):
        raise TypeError(f"'{function_name}' in '{python_function_path}' is not callable")

    # The signature is inspected even when nothing is required.
    declared = list(inspect.signature(candidate).parameters)
    absent_params = [name for name in expected if name not in declared]
    if absent_params:
        raise ValueError(
            f"Function '{function_name}' in '{python_function_path}' is missing "
            f"required parameters: {', '.join(absent_params)}"
        )

    return candidate
Load and validate a Python function from an extension module. + + The module must be importable via sys.path (typically from the extensions directory + which is added to sys.path during pipeline initialization). + + Args: + python_module: Module and function reference in format 'module_name.function_name' + (e.g., 'transforms.customer_source' or 'my_module.sub.get_data') + required_params: List of required parameter names for validation + + Returns: + The callable function from the module + + Raises: + ValueError: If python_module format is invalid + ImportError: If module cannot be imported + AttributeError: If function is not found in module + TypeError: If the attribute is not callable + """ + if required_params is None: + required_params = [] + + # Parse module and function name + if "." not in python_module: + raise ValueError( + f"Invalid pythonModule format: '{python_module}'. " + f"Expected format: 'module_name.function_name' (e.g., 'transforms.get_df')" + ) + + # Split on last dot to get module path and function name + module_path, function_name = python_module.rsplit(".", 1) + + # Import the module + try: + module = importlib.import_module(module_path) + except ModuleNotFoundError as e: + raise ImportError( + f"Could not import module '{module_path}' for pythonModule '{python_module}'. " + f"Ensure the module exists in the extensions directory. Original error: {e}" + ) from e + + # Get the function + if not hasattr(module, function_name): + raise AttributeError( + f"Module '{module_path}' does not contain function '{function_name}'. 
def merge_dicts(*dicts: Dict) -> Dict:
    """Shallow-merge any number of dictionaries into a new one.

    Arguments are applied left to right, so a later dictionary overrides the
    keys of an earlier one. ``None`` arguments are ignored, and calling the
    function without usable arguments yields an empty dict. The inputs are
    never modified.
    """
    combined = {}
    for current in dicts:
        if current is None:
            continue
        combined = {**combined, **current}
    return combined
def set_logger(logger_name: str, log_level: str = "INFO") -> logging.Logger:
    """Set up and return a logger with a specified name and log level.

    The logger is given exactly one handler that writes to ``sys.stdout``;
    handlers attached by an earlier call are removed first so messages are
    not emitted twice.

    Args:
        logger_name: Name passed to ``logging.getLogger``.
        log_level: Level name such as "DEBUG" or "warning" (case-insensitive),
            or a numeric logging level. Anything that is not a recognised
            level falls back to ``logging.INFO``.

    Returns:
        logging.Logger: The configured logger.
    """
    logger = logging.getLogger(logger_name)

    if isinstance(log_level, int) and not isinstance(log_level, bool):
        level = log_level
    else:
        # Only upper-case integer attributes of the logging module are level
        # constants. A bare getattr(logging, "debug") would return the
        # logging.debug *function* and make setLevel() raise TypeError.
        level = getattr(logging, str(log_level).strip().upper(), None)
        if not isinstance(level, int) or isinstance(level, bool):
            level = logging.INFO
    logger.setLevel(level)

    # Clear existing handlers to avoid duplicate logging
    logger.handlers.clear()

    # Add a new handler
    console_output_handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_output_handler.setFormatter(formatter)
    logger.addHandler(console_output_handler)

    return logger