Skip to content

Commit d6be48f

Browse files
authored
Merge pull request #254 from VisActor/feat/regression-lines
feat: add regression lines and dataset transform bin
2 parents 416e435 + 4be586a commit d6be48f

File tree

15 files changed

+1624
-63
lines changed

15 files changed

+1624
-63
lines changed
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import bin from '../src/transform/bin';
2+
3+
// Test suite for the bin transform: bin count, fixed step, explicit thresholds, extent + includeValues, and empty input.
describe('bin transform', () => {
4+
test('basic bins by count', () => {
5+
const data = [] as any[];
6+
for (let i = 0; i < 100; i++) {
7+
data.push({ v: i });
8+
}
9+
const bins = bin(data, { field: 'v', bins: 10 });
10+
expect(bins.length).toBe(10);
11+
const total = bins.reduce((s: number, b: any) => s + b.count, 0);
12+
expect(total).toBe(100);
13+
// 0..99 spread over 10 equal-width bins: every count is expected to stay within 8..12
14+
for (let i = 0; i < bins.length; i++) {
15+
expect(bins[i].count).toBeGreaterThanOrEqual(8);
16+
expect(bins[i].count).toBeLessThanOrEqual(12);
17+
}
18+
});
19+
20+
test('step produces correct bin widths', () => {
21+
const data = [] as any[];
22+
for (let i = 0; i <= 20; i++) {
23+
data.push({ v: i });
24+
}
25+
const bins = bin(data, { field: 'v', step: 5 });
26+
// step=5 over 0..20 -> edges 0,5,10,15,20, i.e. 4 bins; the last bin [15,20] is right-inclusive and so also holds 20
27+
expect(bins.length).toBeGreaterThanOrEqual(4);
28+
// every bin before the last must be exactly one step (5) wide; the loop stops short of the last bin
29+
for (let i = 0; i < bins.length - 1; i++) {
30+
const w = bins[i].x1 - bins[i].x0;
31+
expect(w).toBeCloseTo(5, 12);
32+
}
33+
});
34+
35+
test('explicit thresholds control bin edges', () => {
36+
const data = [{ v: 1 }, { v: 3 }, { v: 7 }, { v: 12 }];
37+
const thresholds = [0, 5, 10, 20];
38+
const bins = bin(data, { field: 'v', thresholds });
39+
expect(bins.length).toBe(3);
40+
expect(bins[0].count).toBe(2); // 1 and 3
41+
expect(bins[1].count).toBe(1); // 7
42+
expect(bins[2].count).toBe(1); // 12
43+
});
44+
45+
test('extent overrides and includeValues', () => {
46+
const data = [
47+
{ v: 2, id: 'a' },
48+
{ v: 8, id: 'b' }
49+
];
50+
const bins = bin(data, { field: 'v', step: 5, extent: [0, 10], includeValues: true });
51+
expect(bins.length).toBeGreaterThan(0);
52+
// collect the ids of the original items kept in each bin's values array
const values = bins.flatMap((b: any) => (b.values ? b.values.map((it: any) => it.id) : []));
53+
expect(values).toContain('a');
54+
expect(values).toContain('b');
55+
});
56+
57+
test('empty data returns empty array', () => {
58+
const out = bin([], { field: 'x', bins: 5 });
59+
expect(Array.isArray(out)).toBeTruthy();
60+
expect(out.length).toBe(0);
61+
});
62+
});

packages/vdataset/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ export type { IMapOptions } from './transform/map';
1717
export { fold } from './transform/fold';
1818
export type { IFoldOptions } from './transform/fold';
1919
export { fields } from './transform/fields';
20+
export type { IBinOptions } from './transform/bin';
21+
export { bin } from './transform/bin';
2022
export type { IFieldsOptions } from './transform/fields';
2123
// transformType
2224
export * from './transform/index';
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import { isNil } from '@visactor/vutils';
2+
import type { Transform } from '.';
3+
4+
export interface IBinOptions {
5+
field: string; // name of the property whose values are coerced to numbers and binned
6+
bins?: number; // number of equal-width bins (default 10); only used when neither thresholds nor step applies
7+
thresholds?: number[]; // explicit bin edges; a non-empty array takes precedence over step and bins and is sorted ascending before use
8+
step?: number; // fixed bin width (interval step); used when it is > 0 and no thresholds are given, overriding bins
9+
extent?: [number, number]; // optional [min, max] to use instead of the data-driven range
10+
includeValues?: boolean; // when truthy, each bin also carries a values array holding the original items
11+
}
12+
13+
/**
 * Bin transform: groups the numeric values of `options.field` into histogram bins.
 *
 * Bin edges come from the first option that applies:
 *  1. `thresholds` - explicit edges (copied, then sorted ascending);
 *  2. `step`       - fixed-width bins starting at `extent[0]` when an extent is given,
 *                    otherwise at the data minimum rounded down to a multiple of `step`;
 *  3. `bins`       - that many equal-width bins across [min, max] (default 10). When
 *                    min === max these bins are zero-width and every value lands in the last one.
 *
 * Each bin is left-inclusive and right-exclusive, except the last bin, which also
 * includes its right edge. Items whose field is null/undefined, is not finite after
 * numeric coercion, or lies outside every bin are ignored.
 *
 * @param data - items to bin; a missing or empty array yields `[]`
 * @param options - binning options; `field` is required
 * @returns an array of `{ x0, x1, count, values? }`; `values` is present only when
 *          `includeValues` is truthy. Returns `[]` when `field` is missing, when no
 *          finite [min, max] can be determined, or when fewer than two edges exist.
 */
export const bin: Transform = (data: Array<object>, options?: IBinOptions) => {
  if (!options || !options.field || !data || !data.length) {
    return [];
  }
  const field = options.field;
  const { extent, includeValues } = options;

  // Reads the binned field from one item; undefined means "skip this item".
  const readValue = (datum: object): number | undefined => {
    const raw = (datum as Record<string, unknown> | null | undefined)?.[field];
    if (raw == null) {
      return undefined;
    }
    const num = Number(raw);
    return Number.isFinite(num) ? num : undefined;
  };

  // Value range: caller-supplied extent wins, otherwise scan the data.
  let min = Infinity;
  let max = -Infinity;
  if (extent) {
    min = extent[0];
    max = extent[1];
  } else {
    for (const datum of data) {
      const num = readValue(datum);
      if (num === undefined) {
        continue;
      }
      if (num < min) {
        min = num;
      }
      if (num > max) {
        max = num;
      }
    }
  }
  if (!Number.isFinite(min) || !Number.isFinite(max)) {
    return [];
  }

  // Build the list of bin edges.
  let thresholds: number[];
  if (options.thresholds && options.thresholds.length) {
    // Sort a copy so the caller's array is never mutated.
    thresholds = options.thresholds.slice().sort((a, b) => a - b);
  } else if (typeof options.step === 'number' && Number.isFinite(options.step) && options.step > 0) {
    const step = options.step;
    let start = extent ? min : Math.floor(min / step) * step;
    if (start > min) {
      // Rounding in floor(min / step) * step can land just above min; step back so min is covered.
      start -= step;
    }
    thresholds = [start];
    // Each edge is start + k * step instead of a running sum, so rounding error cannot
    // accumulate into a spurious trailing bin. do/while guarantees at least one bin, so
    // constant data sitting exactly on a step multiple is still counted.
    let k = 0;
    let edge: number;
    do {
      k += 1;
      edge = start + k * step;
      thresholds.push(edge);
    } while (edge < max);
  } else {
    // Fall back to a bin count; anything that does not floor to a finite integer >= 1 means "default 10".
    const requested = Math.floor(options.bins ?? 0);
    const binCount = Number.isFinite(requested) && requested >= 1 ? requested : 10;
    const width = (max - min) / binCount;
    thresholds = new Array<number>(binCount + 1);
    for (let i = 0; i <= binCount; i++) {
      // Pin the final edge to max so rounding cannot push it below the largest value.
      thresholds[i] = i === binCount ? max : min + width * i;
    }
  }

  const numBins = thresholds.length - 1;
  if (numBins < 1) {
    return [];
  }

  const out: Array<{ x0: number; x1: number; count: number; values?: object[] }> = [];
  for (let i = 0; i < numBins; i++) {
    const x0 = thresholds[i];
    const x1 = thresholds[i + 1];
    out.push(includeValues ? { x0, x1, count: 0, values: [] } : { x0, x1, count: 0 });
  }

  // Assign each item to the first matching bin (linear scan is fine for moderate bin counts).
  const lastIndex = numBins - 1;
  for (const datum of data) {
    const num = readValue(datum);
    if (num === undefined) {
      continue;
    }
    for (let j = 0; j < numBins; j++) {
      const current = out[j];
      // The lower bound applies to every bin, including the last one, so values
      // below the first edge are dropped rather than swept into the last bin.
      const withinRight = num < current.x1 || (j === lastIndex && num <= current.x1);
      if (num >= current.x0 && withinRight) {
        current.count++;
        current.values?.push(datum);
        break;
      }
    }
  }

  return out;
};
124+
125+
export default bin;

packages/vutils/__tests__/common/regression-linear.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ test('regressionLinear()', function () {
1212
}
1313
];
1414
const res = regressionLinear(arr);
15-
expect(res.coef).toEqual([0, 2]);
15+
expect(res.coef).toEqual({ a: 0, b: 2 });
1616
expect(res.predict(1)).toBeCloseTo(2);
1717
});
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import ecdf from '../src/common/ecdf';
2+
3+
// Test suite for the object returned by ecdf(): its n property, evaluate() on a scalar or an array, and evaluateGrid().
describe('ecdf', () => {
4+
test('evaluate single and array and grid', () => {
5+
const data = [1, 2, 2, 3];
6+
const e = ecdf(data);
7+
expect(e.n).toBe(4);
8+
expect(e.evaluate(2)).toBeCloseTo(3 / 4); // 3 of the 4 samples are <= 2
9+
expect(e.evaluate([0, 2, 4])).toEqual([0, 3 / 4, 1]);
10+
const g = e.evaluateGrid(5);
11+
expect(g.length).toBe(5);
12+
// grid y-values must never decrease from one point to the next
13+
for (let i = 1; i < g.length; i++) {
14+
expect(g[i].y).toBeGreaterThanOrEqual(g[i - 1].y);
15+
}
16+
});
17+
18+
test('empty and constant data', () => {
19+
const e0 = ecdf([]);
20+
expect(e0.n).toBe(0);
21+
expect(e0.evaluate(1)).toBe(0);
22+
const e1 = ecdf([5, 5, 5]);
23+
const g = e1.evaluateGrid(3);
24+
// with constant samples every grid point is expected at x = 5 with y = 1
expect(g.every(p => p.x === 5)).toBeTruthy();
25+
expect(g.every(c => c.y === 1)).toBeTruthy();
26+
});
27+
28+
test('evaluate array and single consistent', () => {
29+
const data = [3, 1, 4, 1, 5];
30+
const model = ecdf(data);
31+
const x = 2;
32+
const a = model.evaluate(x) as number;
33+
const b = (model.evaluate([x]) as number[])[0];
34+
expect(a).toBeCloseTo(b, 12);
35+
});
36+
});
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import { epanechnikov, gaussian, kde, scott, silverman } from '../src/common/kde';
2+
3+
// Smoke tests for kde(): result shapes of evaluate()/evaluateGrid() and the bandwidth = 0 case.
describe('kde', () => {
4+
test('evaluate and grid', () => {
5+
const data = [1, 2, 3, 4, 5];
6+
const model = kde(data, { bandwidthMethod: 'scott' });
7+
expect(typeof model.bandwidth).toBe('number');
8+
const v = model.evaluate(3);
9+
expect(typeof v).toBe('number');
10+
const arr = model.evaluate([1, 2, 3]);
11+
expect(Array.isArray(arr)).toBeTruthy();
12+
const g = model.evaluateGrid(10);
13+
expect(Array.isArray(g)).toBeTruthy();
14+
expect(g.length).toBe(10);
15+
});
16+
17+
test('constant data returns zeros density when bandwidth is zero', () => {
18+
const data = [5, 5, 5];
19+
const model = kde(data, { bandwidth: 0 });
20+
const g = model.evaluateGrid(3);
21+
expect(g.length).toBe(3);
22+
// every grid point is expected to report a density of exactly 0
expect(g.every(d => d.y === 0)).toBeTruthy();
23+
});
24+
});
25+
26+
// Second suite registered under the same 'kde' name: kernel choices, the scott/silverman helpers and grid/scalar consistency.
describe('kde', () => {
27+
test('basic gaussian kde returns higher density near samples (evaluator API)', () => {
28+
const data = [0, 0, 1, 2, 3];
29+
const points = [-1, 0, 0.5, 1, 2, 4];
30+
const model = kde(data, { kernel: gaussian, bandwidth: 0.5 });
31+
const densities = model.evaluate(points) as number[];
32+
// the peak is expected near the samples; the assertion only requires that it is not at -1 (the one point below 0)
33+
const maxIdx = densities.indexOf(Math.max(...densities));
34+
expect(points[maxIdx]).toBeGreaterThanOrEqual(0);
35+
expect(densities.length).toBe(points.length);
36+
});
37+
38+
test('epanechnikov kernel gives finite densities and respects bandwidth selectors (evaluator)', () => {
39+
const data = [1, 2, 3, 4, 5];
40+
const points = [0, 1, 2, 3, 4, 5, 6];
41+
const model1 = kde(data, { kernel: epanechnikov, bandwidthMethod: 'scott' });
42+
const model2 = kde(data, { kernel: epanechnikov, bandwidthMethod: 'silverman' });
43+
const d1 = model1.evaluate(points) as number[];
44+
const d2 = model2.evaluate(points) as number[];
45+
expect(d1.length).toBe(points.length);
46+
expect(d2.length).toBe(points.length);
47+
for (let i = 0; i < d1.length; i++) {
48+
expect(isFinite(d1[i])).toBe(true);
49+
expect(isFinite(d2[i])).toBe(true);
50+
}
51+
});
52+
53+
test('bandwidth helpers roughly scale with n and std', () => {
54+
// only positivity is asserted here; scaling with n or std is not actually checked
const s = 2;
55+
const h1 = scott(100, s);
56+
const h2 = silverman(100, s);
57+
expect(h1).toBeGreaterThan(0);
58+
expect(h2).toBeGreaterThan(0);
59+
});
60+
61+
test('evaluateGrid returns N points and densities', () => {
62+
const data = [0, 1, 2];
63+
const model = kde(data, { kernel: gaussian, bandwidth: 0.5 });
64+
const res = model.evaluateGrid(5);
65+
expect(res.length).toBe(5);
66+
// grid x-values must be non-decreasing
67+
for (let i = 1; i < res.length; i++) {
68+
expect(res[i].x).toBeGreaterThanOrEqual(res[i - 1].x);
69+
}
70+
});
71+
72+
test('evaluate(single) equals evaluate([single]) and is finite', () => {
73+
const data = [0, 1, 2, 3];
74+
const model = kde(data, { kernel: gaussian });
75+
const x = 1.3;
76+
const a = model.evaluate(x) as number;
77+
const b = (model.evaluate([x]) as number[])[0];
78+
expect(typeof a).toBe('number');
79+
expect(Number.isFinite(a)).toBe(true);
80+
expect(a).toBeCloseTo(b, 12);
81+
});
82+
83+
test('constant data evaluateGrid returns repeated point and same densities', () => {
84+
const data = [5, 5, 5];
85+
const model = kde(data, { kernel: gaussian });
86+
const res = model.evaluateGrid(4);
87+
expect(res.length).toBe(4);
88+
// all points should equal 5 and all densities should be equal
89+
for (let i = 0; i < res.length; i++) {
90+
expect(res[i].x).toBe(5);
91+
expect(res[i].y).toBeCloseTo(res[0].y, 12);
92+
}
93+
});
94+
95+
test('kernels produce non-negative finite densities and bandwidth present', () => {
96+
const data = [0, 2, 4, 6];
97+
const modelG = kde(data, { kernel: gaussian });
98+
const modelE = kde(data, { kernel: epanechnikov });
99+
const testPoints = [0, 1, 2, 3, 4];
100+
const dg = modelG.evaluate(testPoints) as number[];
101+
const de = modelE.evaluate(testPoints) as number[];
102+
for (let i = 0; i < testPoints.length; i++) {
103+
expect(Number.isFinite(dg[i])).toBe(true);
104+
expect(dg[i]).toBeGreaterThanOrEqual(0);
105+
expect(Number.isFinite(de[i])).toBe(true);
106+
expect(de[i]).toBeGreaterThanOrEqual(0);
107+
}
108+
// no bandwidth option was passed, so a positive default bandwidth is expected on both models
expect(modelG.bandwidth).toBeGreaterThan(0);
109+
expect(modelE.bandwidth).toBeGreaterThan(0);
110+
});
111+
});

0 commit comments

Comments
 (0)