Skip to content
This repository was archived by the owner on Aug 8, 2025. It is now read-only.

Commit c9f23be

Browse files
committed
Add in example for reference mode for YAML, update to 0.16.3
1 parent be6bfca commit c9f23be

6 files changed

Lines changed: 300 additions & 2 deletions

File tree

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
FROM datacatering/data-caterer:0.16.1
1+
FROM datacatering/data-caterer:0.16.3
22

33
COPY --chown=app:app build/libs/data-caterer-example-0.1.0.jar /opt/app/job.jar
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: "new_features_example_plan"
2+
description: "Example showcasing new features: enableFastGeneration, reference mode, and deeply nested fields with SQL"
3+
tasks:
4+
- name: "reference_data_task"
5+
dataSourceName: "csv"
6+
enabled: true
7+
- name: "complex_financial_data_task"
8+
dataSourceName: "json"
9+
enabled: true
10+
11+
sinkOptions:
12+
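# enableFastGeneration flag for improved performance. It is left commented out below because it may not be available in this version, so this plan does not actually exercise it even though the plan description lists it.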
13+
# enableFastGeneration: true
14+
15+
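# Reference mode - the existing CSV reference data (country_code, currency_code) is the foreign key source, and the generated JSON fields listed under generate take their values from it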
16+
foreignKeys:
17+
- source:
18+
dataSource: "csv"
19+
step: "reference_data"
20+
fields: ["country_code", "currency_code"]
21+
generate:
22+
- dataSource: "json"
23+
step: "complex_financial_data"
24+
fields: ["customer_info.address_info.country_code", "account_details.currency_code"]
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
name: "complex_financial_data_task"
2+
steps:
3+
- name: "complex_financial_data"
4+
type: "json"
5+
count:
6+
records: 100
7+
options:
8+
path: "/opt/app/data/complex"
9+
saveMode: "overwrite"
10+
fields:
11+
- name: "account_id"
12+
type: "string"
13+
options:
14+
regex: "ACC[0-9]{8}"
15+
isUnique: true
16+
- name: "created_date"
17+
type: "date"
18+
options:
19+
min: "2023-01-01"
20+
21+
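# Deeply nested customer information with SQL generation. NOTE(review): first_name/last_name take tokens [0] and [1] of full_name split on spaces - presumably wrong if the generated name has a prefix or more than two parts; verify against the name generator's output.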
22+
- name: "customer_info"
23+
type: "struct"
24+
fields:
25+
- name: "customer_id"
26+
type: "string"
27+
options:
28+
regex: "CUST[0-9]{10}"
29+
- name: "personal_details"
30+
type: "struct"
31+
fields:
32+
- name: "full_name"
33+
type: "string"
34+
options:
35+
expression: "#{Name.fullName}"
36+
- name: "first_name"
37+
type: "string"
38+
options:
39+
sql: "SPLIT(customer_info.personal_details.full_name, ' ')[0]"
40+
- name: "last_name"
41+
type: "string"
42+
options:
43+
sql: "SPLIT(customer_info.personal_details.full_name, ' ')[1]"
44+
- name: "email"
45+
type: "string"
46+
options:
47+
sql: "LOWER(CONCAT(customer_info.personal_details.first_name, '.', customer_info.personal_details.last_name, '@datacatering.com'))"
48+
- name: "birth_date"
49+
type: "date"
50+
options:
51+
min: "1950-01-01"
52+
max: "2000-12-31"
53+
- name: "age"
54+
type: "integer"
55+
options:
56+
sql: "YEAR(CURRENT_DATE()) - YEAR(customer_info.personal_details.birth_date)"
57+
- name: "age_group"
58+
type: "string"
59+
options:
60+
sql: "CASE WHEN customer_info.personal_details.age < 25 THEN 'Young Adult' WHEN customer_info.personal_details.age < 40 THEN 'Adult' WHEN customer_info.personal_details.age < 60 THEN 'Middle Age' ELSE 'Senior' END"
61+
- name: "address_info"
62+
type: "struct"
63+
fields:
64+
- name: "street_address"
65+
type: "string"
66+
options:
67+
expression: "#{Address.streetAddress}"
68+
- name: "city"
69+
type: "string"
70+
options:
71+
expression: "#{Address.city}"
72+
- name: "state"
73+
type: "string"
74+
options:
75+
expression: "#{Address.state}"
76+
- name: "postal_code"
77+
type: "string"
78+
options:
79+
expression: "#{Address.zipCode}"
80+
- name: "country_code"
81+
type: "string"
82+
# Will be linked via foreign key to reference data
83+
- name: "full_address"
84+
type: "string"
85+
options:
86+
sql: "CONCAT(customer_info.address_info.street_address, ', ', customer_info.address_info.city, ', ', customer_info.address_info.state, ' ', customer_info.address_info.postal_code)"
87+
- name: "is_urban"
88+
type: "boolean"
89+
options:
90+
sql: "CASE WHEN customer_info.address_info.city IN ('New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix') THEN true ELSE false END"
91+
- name: "contact_preferences"
92+
type: "struct"
93+
fields:
94+
- name: "email_marketing"
95+
type: "boolean"
96+
options:
97+
sql: "CASE WHEN RAND() > 0.3 THEN true ELSE false END"
98+
- name: "sms_notifications"
99+
type: "boolean"
100+
options:
101+
sql: "CASE WHEN RAND() > 0.5 THEN true ELSE false END"
102+
- name: "phone_calls"
103+
type: "boolean"
104+
options:
105+
sql: "CASE WHEN RAND() > 0.7 THEN true ELSE false END"
106+
- name: "preferred_contact_time"
107+
type: "string"
108+
options:
109+
sql: "CASE WHEN RAND() < 0.3 THEN 'Morning' WHEN RAND() < 0.6 THEN 'Afternoon' ELSE 'Evening' END"
110+
111+
# Account details with complex calculations
112+
- name: "account_details"
113+
type: "struct"
114+
fields:
115+
- name: "account_type"
116+
type: "string"
117+
options:
118+
oneOf:
119+
- "CHECKING"
120+
- "SAVINGS"
121+
- "INVESTMENT"
122+
- "CREDIT"
123+
- name: "balance"
124+
type: "decimal"
125+
options:
126+
precision: 15
127+
scale: 2
128+
min: 0
129+
max: 1000000
130+
- name: "currency_code"
131+
type: "string"
132+
# Will be linked via foreign key to reference data
133+
- name: "interest_rate"
134+
type: "decimal"
135+
options:
136+
sql: "CASE WHEN account_details.account_type = 'CHECKING' THEN 0.01 WHEN account_details.account_type = 'SAVINGS' THEN 0.02 WHEN account_details.account_type = 'INVESTMENT' THEN 0.05 ELSE 0.18 END"
137+
- name: "monthly_fee"
138+
type: "decimal"
139+
options:
140+
sql: "CASE WHEN account_details.account_type = 'CHECKING' AND account_details.balance < 1000 THEN 10.00 WHEN account_details.account_type = 'SAVINGS' AND account_details.balance < 500 THEN 5.00 ELSE 0.00 END"
141+
- name: "annual_interest"
142+
type: "decimal"
143+
options:
144+
sql: "account_details.balance * account_details.interest_rate"
145+
- name: "account_status"
146+
type: "string"
147+
options:
148+
sql: "CASE WHEN account_details.balance > 100000 THEN 'PREMIUM' WHEN account_details.balance > 10000 THEN 'GOLD' WHEN account_details.balance > 1000 THEN 'STANDARD' ELSE 'BASIC' END"
149+
150+
# Transaction history with array of complex objects
151+
- name: "transaction_history"
152+
type: "array"
153+
options:
154+
arrayMinLen: 5
155+
arrayMaxLen: 20
156+
fields:
157+
- name: "transaction_id"
158+
type: "string"
159+
options:
160+
regex: "TXN[0-9]{12}"
161+
- name: "timestamp"
162+
type: "timestamp"
163+
options:
164+
min: "2023-01-01T00:00:00Z"
165+
max: "2024-12-31T23:59:59Z"
166+
- name: "amount"
167+
type: "decimal"
168+
options:
169+
precision: 10
170+
scale: 2
171+
min: -5000
172+
max: 5000
173+
- name: "transaction_type"
174+
type: "string"
175+
options:
176+
sql: "CASE WHEN transaction_history.amount > 0 THEN 'CREDIT' ELSE 'DEBIT' END"
177+
- name: "category"
178+
type: "string"
179+
options:
180+
sql: "CASE WHEN transaction_history.transaction_type = 'CREDIT' THEN CASE WHEN RAND() < 0.3 THEN 'SALARY' WHEN RAND() < 0.6 THEN 'INVESTMENT' ELSE 'TRANSFER' END ELSE CASE WHEN RAND() < 0.2 THEN 'GROCERIES' WHEN RAND() < 0.4 THEN 'ENTERTAINMENT' WHEN RAND() < 0.6 THEN 'UTILITIES' WHEN RAND() < 0.8 THEN 'TRANSPORTATION' ELSE 'MISC' END END"
181+
- name: "merchant_info"
182+
type: "struct"
183+
fields:
184+
# - name: "name"
185+
# type: "string"
186+
# options:
187+
# expression: "#{Company.name}"
188+
- name: "category"
189+
type: "string"
190+
options:
191+
sql: "transaction_history.category"
192+
- name: "location"
193+
type: "string"
194+
options:
195+
sql: "CONCAT(customer_info.address_info.city, ', ', customer_info.address_info.state)"
196+
- name: "is_large_transaction"
197+
type: "boolean"
198+
options:
199+
sql: "ABS(transaction_history.amount) > 1000"
200+
- name: "day_of_week"
201+
type: "string"
202+
options:
203+
sql: "DATE_FORMAT(transaction_history.timestamp, 'EEEE')"
204+
- name: "is_weekend"
205+
type: "boolean"
206+
options:
207+
sql: "DATE_FORMAT(transaction_history.timestamp, 'EEEE') IN ('Saturday', 'Sunday')"
208+
209+
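# Risk assessment with complex nested calculations. NOTE(review): risk_score is the product of two factors that are each at most 0.8, so it never exceeds 0.64; the `risk_score > 0.7` condition in requires_manual_review can therefore never be true (only the balance condition can trigger it) - confirm the intended thresholds.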
210+
- name: "risk_assessment"
211+
type: "struct"
212+
fields:
213+
- name: "risk_score"
214+
type: "decimal"
215+
options:
216+
sql: "CASE WHEN customer_info.personal_details.age < 25 THEN 0.8 WHEN customer_info.personal_details.age < 40 THEN 0.5 WHEN customer_info.personal_details.age < 60 THEN 0.3 ELSE 0.4 END * CASE WHEN account_details.balance > 100000 THEN 0.2 WHEN account_details.balance > 10000 THEN 0.5 ELSE 0.8 END"
217+
- name: "risk_category"
218+
type: "string"
219+
options:
220+
sql: "CASE WHEN risk_assessment.risk_score < 0.3 THEN 'LOW' WHEN risk_assessment.risk_score < 0.6 THEN 'MEDIUM' ELSE 'HIGH' END"
221+
- name: "credit_limit"
222+
type: "decimal"
223+
options:
224+
sql: "CASE WHEN risk_assessment.risk_category = 'LOW' THEN account_details.balance * 5 WHEN risk_assessment.risk_category = 'MEDIUM' THEN account_details.balance * 3 ELSE account_details.balance * 1.5 END"
225+
- name: "requires_manual_review"
226+
type: "boolean"
227+
options:
228+
sql: "risk_assessment.risk_score > 0.7 OR account_details.balance > 500000"
229+
- name: "last_review_date"
230+
type: "date"
231+
options:
232+
sql: "CASE WHEN risk_assessment.requires_manual_review THEN DATE_SUB(CURRENT_DATE(), 30) ELSE DATE_SUB(CURRENT_DATE(), 90) END"
233+
234+
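# Aggregated statistics with complex calculations. NOTE(review): daily_avg_balance divides by account_age_days, which is 0 if created_date can be generated as the current date (created_date sets only a min) - confirm how the SQL engine handles that division.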
235+
- name: "account_statistics"
236+
type: "struct"
237+
fields:
238+
- name: "total_transactions"
239+
type: "integer"
240+
options:
241+
sql: "SIZE(transaction_history)"
242+
- name: "total_credits"
243+
type: "integer"
244+
options:
245+
sql: "SIZE(FILTER(transaction_history, x -> x.transaction_type = 'CREDIT'))"
246+
- name: "total_debits"
247+
type: "integer"
248+
options:
249+
sql: "SIZE(FILTER(transaction_history, x -> x.transaction_type = 'DEBIT'))"
250+
- name: "weekend_transaction_ratio"
251+
type: "decimal"
252+
options:
253+
sql: "SIZE(FILTER(transaction_history, x -> x.is_weekend)) / SIZE(transaction_history)"
254+
- name: "account_age_days"
255+
type: "integer"
256+
options:
257+
sql: "DATEDIFF(CURRENT_DATE(), created_date)"
258+
- name: "daily_avg_balance"
259+
type: "decimal"
260+
options:
261+
sql: "account_details.balance / account_statistics.account_age_days"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
name: "reference_data_task"
2+
steps:
3+
- name: "reference_data"
4+
type: "csv"
5+
options:
6+
path: "/opt/app/mount/reference/country_data.csv"
7+
header: true
8+
enableReferenceMode: true
9+
enableDataGeneration: false
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
country_code,country_name,currency_code,timezone
2+
AU,Australia,AUD,Australia/Sydney
3+
US,United States,USD,America/New_York
4+
CA,Canada,CAD,America/Toronto

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ version=0.1.0
88

99
scalaVersion=2.12
1010
scalaSpecificVersion=2.12.19
11-
dataCatererVersion=0.16.1
11+
dataCatererVersion=0.16.3
1212
sparkMajorVersion=3.5

0 commit comments

Comments
 (0)