Skip to content

Commit a1d42c4

Browse files
authored
Add network chaos to fork tests (#1570)
* Add network chaos to fork tests * Set back to 20 epochs * Add streams to fork test * Add DB chaos by locking DB file * Refactor chaos * Clone workers in streams * Temporarily turn on device sync and multiple client instances * Properly catch missing groups * Revert "Temporarily turn on device sync and multiple client instances" This reverts commit 76fcd9b.
1 parent bbf878e commit a1d42c4

File tree

28 files changed

+1374
-329
lines changed

28 files changed

+1374
-329
lines changed

chaos/db.ts

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import type { ChaosProvider } from "@chaos/provider";
2+
import type { WorkerManager } from "@workers/manager";
3+
4+
export type DbChaosConfig = {
5+
minLockTime: number; // Minimum duration in milliseconds to lock the database
6+
maxLockTime: number; // Maximum duration in milliseconds to lock the database
7+
lockInterval: number; // Interval in milliseconds between lock attempts
8+
impactedWorkerPercentage: number; // number between 0 and 100 for what % of workers to lock on each run
9+
};
10+
11+
export class DbChaos implements ChaosProvider {
12+
config: DbChaosConfig;
13+
activeLocks = new Map<string, Promise<void>>();
14+
interval?: NodeJS.Timeout;
15+
16+
constructor(config: DbChaosConfig) {
17+
validateConfig(config);
18+
this.config = config;
19+
}
20+
21+
start(workers: WorkerManager): Promise<void> {
22+
const { minLockTime, maxLockTime, lockInterval, impactedWorkerPercentage } =
23+
this.config;
24+
console.log(
25+
`Starting DB Chaos:
26+
Locking for ${minLockTime}ms - ${maxLockTime}ms
27+
Interval: ${lockInterval}ms`,
28+
);
29+
this.interval = setInterval(() => {
30+
for (const worker of workers.getAll()) {
31+
if (Math.random() * 100 > impactedWorkerPercentage) {
32+
continue;
33+
}
34+
const duration = Math.floor(
35+
minLockTime + Math.random() * (maxLockTime - minLockTime),
36+
);
37+
38+
const lockKey = `${worker.name}-${worker.installationId}`;
39+
40+
// Only lock if not already locked
41+
if (!this.activeLocks.has(lockKey)) {
42+
console.log(
43+
`[db-chaos] Locking ${worker.name} database for ${duration}ms`,
44+
);
45+
46+
// Call the lockDB method on the worker and track it
47+
const lockPromise = worker.worker
48+
.lockDB(duration)
49+
.catch((err: unknown) => {
50+
console.warn(err);
51+
})
52+
.finally(() => {
53+
this.activeLocks.delete(lockKey);
54+
});
55+
56+
this.activeLocks.set(lockKey, lockPromise);
57+
}
58+
}
59+
}, lockInterval);
60+
61+
return Promise.resolve();
62+
}
63+
64+
async stop() {
65+
console.log("Stopping DB Chaos");
66+
if (this.interval) {
67+
clearInterval(this.interval);
68+
}
69+
70+
// Wait for all the existing locks to complete
71+
await Promise.allSettled(Array.from(this.activeLocks.values()));
72+
}
73+
}
74+
75+
function validateConfig(config: DbChaosConfig): void {
76+
if (config.minLockTime > config.maxLockTime) {
77+
throw new Error(
78+
"Minimum lock time cannot be greater than maximum lock time",
79+
);
80+
}
81+
82+
if (
83+
config.impactedWorkerPercentage < 0 ||
84+
config.impactedWorkerPercentage > 100
85+
) {
86+
throw new Error("Impacted worker percentage must be between 0 and 100");
87+
}
88+
89+
if (!config.lockInterval) {
90+
throw new Error("Lock interval must be defined");
91+
}
92+
93+
if (config.impactedWorkerPercentage === undefined) {
94+
throw new Error("Impacted worker percentage must be defined");
95+
}
96+
}

chaos/network.ts

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import type { ChaosProvider } from "@chaos/provider";
2+
import type { DockerContainer } from "network-stability/container";
3+
4+
export type NetworkChaosConfig = {
5+
delayMin: number; // Minimum delay in ms
6+
delayMax: number; // Maximum delay in ms
7+
jitterMin: number; // Minimum jitter in ms
8+
jitterMax: number; // Maximum jitter in ms
9+
lossMin: number; // Minimum packet loss percentage (0-100)
10+
lossMax: number; // Maximum packet loss percentage (0-100)
11+
interval: number; // How often to apply chaos in ms
12+
};
13+
14+
export class NetworkChaos implements ChaosProvider {
15+
config: NetworkChaosConfig;
16+
interval?: NodeJS.Timeout;
17+
nodes: DockerContainer[];
18+
19+
constructor(config: NetworkChaosConfig, nodes: DockerContainer[]) {
20+
this.config = config;
21+
this.nodes = nodes;
22+
}
23+
24+
start(): Promise<void> {
25+
console.log(`Starting network chaos:
26+
Nodes: ${this.nodes.map((node) => node.name).join(", ")}
27+
Delay: ${this.config.delayMin}ms - ${this.config.delayMax}ms
28+
Jitter: ${this.config.jitterMin}ms - ${this.config.jitterMax}ms
29+
Loss: ${this.config.lossMin}% - ${this.config.lossMax}%
30+
Interval: ${this.config.interval}ms`);
31+
32+
validateContainers(this.nodes);
33+
this.clearAll();
34+
35+
this.interval = setInterval(() => {
36+
for (const node of this.nodes) {
37+
this.applyToNode(node);
38+
}
39+
}, this.config.interval);
40+
41+
return Promise.resolve();
42+
}
43+
44+
private applyToNode(node: DockerContainer) {
45+
const { delayMin, delayMax, jitterMin, jitterMax, lossMin, lossMax } =
46+
this.config;
47+
const delay = Math.floor(delayMin + Math.random() * (delayMax - delayMin));
48+
const jitter = Math.floor(
49+
jitterMin + Math.random() * (jitterMax - jitterMin),
50+
);
51+
const loss = lossMin + Math.random() * (lossMax - lossMin);
52+
53+
try {
54+
node.addJitter(delay, jitter);
55+
node.addLoss(loss);
56+
} catch (err) {
57+
console.warn(`[chaos] Error applying netem on ${node.name}:`, err);
58+
}
59+
}
60+
61+
clearAll() {
62+
for (const node of this.nodes) {
63+
try {
64+
node.clearLatency();
65+
} catch (err) {
66+
console.warn(`[chaos] Error clearing latency on ${node.name}:`, err);
67+
}
68+
}
69+
}
70+
71+
stop(): Promise<void> {
72+
if (this.interval) {
73+
clearInterval(this.interval);
74+
}
75+
76+
this.clearAll();
77+
78+
return Promise.resolve();
79+
}
80+
}
81+
82+
const validateContainers = (allNodes: DockerContainer[]) => {
83+
for (const node of allNodes) {
84+
try {
85+
// Test if container exists by trying to get its IP
86+
if (!node.ip || !node.veth) {
87+
throw new Error(`Container ${node.name} has no IP address`);
88+
}
89+
} catch {
90+
throw new Error(
91+
`Docker container ${node.name} is not running. Network chaos requires local multinode setup (./dev/up).`,
92+
);
93+
}
94+
}
95+
};

chaos/provider.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
import { type WorkerManager } from "@workers/manager";
2+
3+
// Generic interface for the Chaos Provider.
4+
// A Chaos Provider is started after the test has been setup, but before we start performing actions
5+
// It is stopped after the core of the test has been completed, and should remove all chaos so that final
6+
// validations can be performed cleanly.
7+
export interface ChaosProvider {
8+
start(workers: WorkerManager): Promise<void>;
9+
stop(): Promise<void>;
10+
}

chaos/streams.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import type { ChaosProvider } from "@chaos/provider";
2+
import { typeofStream, type WorkerClient } from "@workers/main";
3+
import type { WorkerManager } from "@workers/manager";
4+
5+
export type StreamsConfig = {
6+
cloned: boolean; // Should the stream be run against the workers used in the tests, or a cloned client instance?
7+
};
8+
9+
export class StreamsChaos implements ChaosProvider {
10+
workers?: WorkerClient[];
11+
config: StreamsConfig;
12+
13+
constructor(config: StreamsConfig) {
14+
this.config = config;
15+
}
16+
17+
async start(workers: WorkerManager) {
18+
console.log("Starting StreamsChaos");
19+
let allWorkers = workers.getAll().map((w) => w.worker);
20+
if (this.config.cloned) {
21+
allWorkers = await Promise.all(allWorkers.map((w) => w.clone()));
22+
}
23+
24+
this.workers = allWorkers;
25+
for (const worker of allWorkers) {
26+
worker.startStream(typeofStream.Message);
27+
}
28+
29+
return Promise.resolve();
30+
}
31+
32+
stop() {
33+
if (this.workers) {
34+
for (const worker of this.workers) {
35+
worker.stopStreams();
36+
}
37+
}
38+
39+
return Promise.resolve();
40+
}
41+
}

cli/gen.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,9 @@ EXAMPLES:
6262
yarn gen --help
6363
6464
PRESET COMMANDS:
65-
yarn update:local Update 500 inboxes for local testing
66-
yarn update:prod Update inboxes for production testing
67-
yarn restart:prod Restart production installations (force recreate)
65+
yarn gen update:local Update 500 inboxes for local testing
66+
yarn gen update:prod Update inboxes for production testing
67+
yarn gen restart:prod Restart production installations (force recreate)
6868
6969
For more information, see: cli/readme.md
7070
`);

0 commit comments

Comments
 (0)