Skip to content

Fetch from products.json endpoints #19

Fetch from products.json endpoints

Fetch from products.json endpoints #19

name: Fetch from products.json endpoints

on:
  # Weekly run: Wednesdays at 15:00 UTC (07:00 PST / 08:00 PDT).
  schedule:
    - cron: "0 15 * * 3"
  # Manual runs are allowed as well.
  workflow_dispatch:

permissions:
  contents: write  # needed to push commits
  issues: write  # needed to open an issue when the pipeline fails

# All runs share one concurrency group; a new run does not cancel the run that
# is already in progress.
concurrency:
  group: data-branch-writer
  cancel-in-progress: false
jobs:
  extract-and-process:
    env:
      # Exposed job-wide so the checkout step can test whether a personal
      # access token was configured (empty when the secret is unset).
      PAT_TOKEN: ${{ secrets.PAT_TOKEN }}
    runs-on: ubuntu-latest
    steps:
# Check out with PAT_TOKEN when it is non-empty, otherwise with the default
# workflow token.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    # An empty string is falsy in Actions expressions, so `||` alone selects
    # the fallback token.
    token: ${{ env.PAT_TOKEN || github.token }}
# Install Node.js 20 with the action's npm cache enabled.
- name: Setup Node.js
  uses: actions/setup-node@v4
  with:
    cache: npm
    node-version: "20"

# Install the project's dependencies with `npm ci`.
- name: Install dependencies
  run: npm ci
# Write CONVEX_URL into a fresh .env file in the workspace root.
- name: Create .env file
  env:
    CONVEX_URL: ${{ secrets.CONVEX_URL }}
  run: |
    # The secret reaches the script through the environment instead of being
    # templated into the script text, so quotes, `$` or backticks in the value
    # can neither break the command nor be interpreted by the shell.
    printf 'CONVEX_URL=%s\n' "$CONVEX_URL" > .env
# Run the pipeline and publish its exit code as steps.pipeline.outputs.exit_code.
# continue-on-error lets the later steps run (and branch on that output) even
# when the pipeline fails.
- name: Run pipeline
  id: pipeline
  continue-on-error: true
  run: |
    set +e  # Don't exit on error: the exit code has to be recorded first
    npm run pipeline
    EXIT_CODE=$?
    echo "🔍 Pipeline exit code: $EXIT_CODE"
    echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
    # Re-raise the pipeline's status so this step is still marked as failed.
    exit "$EXIT_CODE"
# Runs the project's `update-symlinks` npm script, but only when the pipeline
# step reported exit code 0.
- name: Update latest symlinks
  if: ${{ steps.pipeline.outputs.exit_code == '0' }}
  run: npm run update-symlinks
# Publish the runner's current date (YYYY-MM-DD) as steps.date.outputs.date.
- name: Get current date
  id: date
  run: |
    printf 'date=%s\n' "$(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT
# Copy the generated data/ tree into a worktree of the data branch, commit it
# there and push. Runs only when the pipeline step reported exit code 0.
- name: Commit and push results
  if: steps.pipeline.outputs.exit_code == '0'
  env:
    # Passed through the environment rather than templated into the script.
    RUN_DATE: ${{ steps.date.outputs.date }}
    RUN_NUMBER: ${{ github.run_number }}
  run: |
    DATA_BRANCH="data"
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"

    # Debug: show what files exist.
    echo "📂 Checking data directory contents:"
    ls -la data/ || echo "data/ directory doesn't exist"

    # Prepare a separate worktree for the data branch so we don't push
    # artifacts to main.
    git fetch origin "${DATA_BRANCH}" || true
    rm -rf ../data-branch
    if git show-ref --verify --quiet "refs/remotes/origin/${DATA_BRANCH}"; then
      git worktree add ../data-branch "origin/${DATA_BRANCH}"
    else
      # No remote data branch yet: start it as an orphan with an empty tree.
      git worktree add ../data-branch --detach
      cd ../data-branch
      git checkout --orphan "${DATA_BRANCH}"
      git rm -rf . || true
      cd -
    fi

    # Mirror the freshly generated data/ directory into the worktree.
    rsync -a --delete --exclude '.git' data/ ../data-branch/data/
    cd ../data-branch

    # printf instead of a heredoc: a heredoc terminator inside a YAML block
    # scalar only works at one exact indentation.
    printf '%s\n' \
      'This branch contains generated YogaMatLabData pipeline outputs under `data/`.' \
      'It is updated automatically by GitHub Actions.' \
      > README.md
    git add -A README.md

    # Use --force to override .gitignore (data/ is ignored locally but not in CI).
    # Exclude volatile per-run state files that cause merge conflicts when
    # multiple runs happen.
    git add --force data/ \
      ':(exclude)data/.hash-registry.json' \
      ':(exclude)data/changes/latest-changeset.json'

    # Debug: show what's staged.
    echo "📦 Staged changes:"
    git status

    # Nothing staged means nothing to publish.
    if git diff --staged --quiet; then
      echo "⚠️ No changes to commit"
      exit 0
    fi

    echo "✅ Changes detected, preparing commit..."

    # Read the changeset summary (copied in by rsync, but excluded from the
    # commit above) for the commit message.
    CHANGESET_FILE="data/changes/latest-changeset.json"
    if [ -f "$CHANGESET_FILE" ]; then
      NEW_PRODUCTS=$(jq -r '.summary.newProducts' "$CHANGESET_FILE")
      REMOVED_PRODUCTS=$(jq -r '.summary.removedProducts' "$CHANGESET_FILE")
      PRICE_CHANGES=$(jq -r '.summary.priceChanges' "$CHANGESET_FILE")
      TOTAL=$(jq -r '.summary.totalChanges' "$CHANGESET_FILE")

      # Values are passed to printf as arguments, so characters such as `/`
      # or `&` cannot corrupt the message (they would break a sed template).
      # The blank line after the first line keeps the subject separate from
      # the body.
      COMMIT_MSG_FILE="$(mktemp)"
      {
        printf 'Data update: %s\n\n' "$RUN_DATE"
        printf 'Changes detected:\n'
        printf -- '- New products: %s\n' "$NEW_PRODUCTS"
        printf -- '- Removed products: %s\n' "$REMOVED_PRODUCTS"
        printf -- '- Price changes: %s\n' "$PRICE_CHANGES"
        printf -- '- Total changes: %s\n\n' "$TOTAL"
        printf 'Generated with YogaMatLab Data Pipeline\n'
        printf 'Run: %s\n' "$RUN_NUMBER"
      } > "$COMMIT_MSG_FILE"
      git commit -F "$COMMIT_MSG_FILE"
    else
      git commit -m "Data update: ${RUN_DATE}" \
        -m "Generated with YogaMatLab Data Pipeline" \
        -m "Run: ${RUN_NUMBER}"
    fi

    # Fully qualified destination: works whether HEAD is detached (existing
    # branch) or on the new orphan branch.
    git push origin "HEAD:refs/heads/${DATA_BRANCH}"
# Upload the logs/ directory as an artifact on every run, including runs where
# an earlier step failed; the artifact is kept for 30 days.
- name: Upload extraction logs
  if: ${{ always() }}
  uses: actions/upload-artifact@v4
  with:
    path: logs/
    name: extraction-logs-${{ steps.date.outputs.date }}
    retention-days: 30
# Open a tracking issue whenever the pipeline step did not report exit code 0.
- name: Create issue on failure
  if: steps.pipeline.outputs.exit_code != '0'
  uses: actions/github-script@v7
  env:
    # Read via process.env instead of templating the value into the script.
    RUN_DATE: ${{ steps.date.outputs.date }}
  with:
    script: |
      const date = process.env.RUN_DATE;
      const { owner, repo } = context.repo;
      // Built from the run context so the link also works off github.com.
      const runUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;

      const body = [
        `The extraction pipeline failed on ${date}.`,
        '',
        `**Run details:** ${runUrl}`,
        '',
        `**Date:** ${date}`,
        `**Run number:** ${context.runNumber}`,
        '',
        'Please check the logs for details.',
        '',
        'This issue was automatically created by GitHub Actions.',
      ].join('\n');

      // Await the request so an API error fails this step directly instead of
      // surfacing as an unhandled promise rejection.
      await github.rest.issues.create({
        owner,
        repo,
        title: `Pipeline failed: ${date}`,
        body,
        labels: ['pipeline-failure', 'automated'],
      });
# Append a Markdown report to the job summary: status, plus statistics, change
# counts and a per-brand breakdown when the corresponding JSON files exist.
- name: Post summary
  if: always()
  env:
    # Passed through the environment rather than templated into the script.
    RUN_DATE: ${{ steps.date.outputs.date }}
    RUN_STATUS: "${{ steps.pipeline.outputs.exit_code == '0' && '✅ Success' || '❌ Failed' }}"
  run: |
    STATS_FILE="data/aggregated/${RUN_DATE}/stats.json"
    CHANGESET_FILE="data/changes/latest-changeset.json"
    BRANDS_FILE="data/raw/${RUN_DATE}/_summary.json"

    # Everything printed inside the group is appended to the summary once.
    {
      echo "## Pipeline Execution Summary"
      echo ""
      echo "**Date:** ${RUN_DATE}"
      echo "**Status:** ${RUN_STATUS}"
      echo ""

      if [ -f "$STATS_FILE" ]; then
        echo "### Statistics"
        echo ""
        echo "- **Total Products:** $(jq -r '.totalProducts' "$STATS_FILE")"
        echo "- **Total Brands:** $(jq -r '.totalBrands' "$STATS_FILE")"
        echo "- **Price Range:** \$$(jq -r '.priceStats.min' "$STATS_FILE") - \$$(jq -r '.priceStats.max' "$STATS_FILE")"
      fi

      if [ -f "$CHANGESET_FILE" ]; then
        echo ""
        echo "### Changes"
        echo ""
        echo "- **New Products:** $(jq -r '.summary.newProducts' "$CHANGESET_FILE")"
        echo "- **Removed Products:** $(jq -r '.summary.removedProducts' "$CHANGESET_FILE")"
        echo "- **Price Changes:** $(jq -r '.summary.priceChanges' "$CHANGESET_FILE")"
      fi

      # Show per-brand breakdown. jq renders each line itself, so a `|` inside
      # a brand name or error text cannot shift fields the way a delimiter-split
      # shell `read` would. A missing or empty error is reported as "failed".
      if [ -f "$BRANDS_FILE" ]; then
        echo ""
        echo "### Brand Breakdown"
        echo ""
        jq -r '
          .results[]
          | if (.success | tostring) == "true" then
              "- ✅ **\(.brandName)**: \(.productCount) products"
            else
              "- ❌ **\(.brandName)**: 0 products (\((.error // "") | if . == "" then "failed" else . end))"
            end
        ' "$BRANDS_FILE" || echo "- _Brand breakdown unavailable_"
      fi
    } >> "$GITHUB_STEP_SUMMARY"