Skip to content

Commit 3dba2e6

Browse files
Now using vdom to normalize HTML & compare hashes, add CSV/admin routes
1 parent 8f3a50f commit 3dba2e6

File tree

9 files changed

+443
-12
lines changed

9 files changed

+443
-12
lines changed

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"@fastify/aws-lambda": "^4.1.0",
2222
"@types/node": "^20.12.7",
2323
"aws-jwt-verify": "^4.0.1",
24+
"cheerio": "^1.0.0",
2425
"esbuild": "^0.21.5",
2526
"fastify": "^4.26.2",
2627
"nodemon": "^3.1.0",

src/app.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import Fastify from 'fastify';
2-
import { addProperties, addReports, addResults, addScans, deleteProperties, deleteReports, deleteUser, getApikey, getCharts, getFilters, getProperties, getReports, getResultsAll, getResultsMessages, getResultsSchema, getResultsTags, getResultsUrls, getScans, getUpdates, help, trackUser, updateProperties, updateReports } from '#src/routes';
2+
import { addProperties, addReports, addResults, addScans, adminClearCache, adminProcessScans, deleteProperties, deleteReports, deleteUser, getApikey, getCharts, getFilters, getProperties, getReports, getResultsAll, getResultsCsv, getResultsMessages, getResultsSchema, getResultsTags, getResultsUrls, getScans, getUpdates, help, trackUser, updateProperties, updateReports } from '#src/routes';
33
import { CognitoJwtVerifier } from 'aws-jwt-verify';
44
import { db } from './utils';
55
import { getScan } from './routes/getScan';
@@ -13,9 +13,10 @@ export const jwtClaims = { sub: null };
1313

1414
fastify.addHook('preHandler', async (request, reply) => {
1515
try {
16-
if (request.headers.apikey) {
16+
const apikey = request.headers.apikey ?? request.query.apikey;
17+
if (apikey) {
1718
await db.connect();
18-
const userId = (await db.query(`SELECT "id" FROM "users" WHERE "apikey"=$1`, [request.headers.apikey])).rows[0].id;
19+
const userId = (await db.query(`SELECT "id" FROM "users" WHERE "apikey"=$1`, [apikey])).rows[0].id;
1920
await db.clean();
2021
request.headers['x-hasura-user-id'] = userId;
2122
request.headers['x-hasura-role'] = 'user';
@@ -35,6 +36,7 @@ fastify.addHook('preHandler', async (request, reply) => {
3536
fastify.get('/get/results', {}, async (request, reply) => getResultsAll({ request, reply }));
3637
fastify.get('/get/results/schema', {}, async (request, reply) => getResultsSchema({ request, reply }));
3738
fastify.get('/get/results/all', {}, async (request, reply) => getResultsAll({ request, reply }));
39+
fastify.get('/get/results/csv', {}, async (request, reply) => getResultsCsv({ request, reply }));
3840
fastify.get('/get/results/messages', {}, async (request, reply) => getResultsMessages({ request, reply }));
3941
fastify.get('/get/results/tags', {}, async (request, reply) => getResultsTags({ request, reply }));
4042
fastify.get('/get/results/urls', {}, async (request, reply) => getResultsUrls({ request, reply }));
@@ -65,6 +67,8 @@ fastify.delete('/delete/user', {}, async (request, reply) => deleteUser({ reques
6567
// MISC requests
6668
fastify.post('/help', {}, async (request, reply) => help({ request, reply }));
6769
fastify.post('/track/user', {}, async (request, reply) => trackUser({ request, reply }));
70+
fastify.get('/admin/clear-cache', {}, async (request, reply) => adminClearCache({ request, reply }));
71+
fastify.get('/admin/process-scans', {}, async (request, reply) => adminProcessScans({ request, reply }));
6872

6973
fastify.listen({ port: 3000 }, (err) => {
7074
console.log(`Server listening on ${fastify.server.address().port}`)

src/internal/processScans.ts

Lines changed: 90 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { chunk, db, isStaging, sleep, hashStringToUuid } from '#src/utils';
2+
import * as cheerio from 'cheerio';
23

34
export const processScans = async (event) => {
45
console.log(`START PROCESS SCANS`);
@@ -95,13 +96,16 @@ export const processScans = async (event) => {
9596
const equalifiedNodeIds = (await db.query({
9697
text: `SELECT "id" FROM "enodes" WHERE "equalified"=$1 AND "user_id"=$2 AND "url_id"=ANY($3)`,
9798
values: [false, userId, allPropertyUrlIds],
98-
})).rows.filter(obj => ![...allNodeIds, ...failedNodeIds].map(obj => obj.id).includes(obj.id)).map(obj => obj.id);
99+
})).rows.map(obj => obj.id).filter(obj => ![...allNodeIds, ...failedNodeIds].map(obj => obj).includes(obj));
100+
101+
console.log(JSON.stringify({ allPropertyUrlIds, equalifiedNodeIds, allNodeIds, failedNodeIds }));
99102

100103
for (const equalifiedNodeId of equalifiedNodeIds) {
101104
const existingNodeUpdateId = (await db.query({
102105
text: `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3`,
103106
values: [userId, equalifiedNodeId, `${new Date().toISOString().split('T')[0]}%`],
104107
})).rows[0]?.id;
108+
console.log(JSON.stringify({ equalifiedNodeId }));
105109
if (existingNodeUpdateId) {
106110
// We found an existing node update for today, let's simply update it
107111
await db.query({
@@ -115,12 +119,14 @@ export const processScans = async (event) => {
115119
text: `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)`,
116120
values: [userId, equalifiedNodeId, true],
117121
});
122+
console.log(JSON.stringify({ message: 'Inserted equalified node update' }));
118123
}
119124
// Now that we've inserted an "equalified" node update, let's set the parent node to "equalified" too!
120125
await db.query({
121126
text: `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2`,
122127
values: [true, equalifiedNodeId],
123128
});
129+
console.log(JSON.stringify({ message: 'Update equalified node' }));
124130
}
125131

126132
// For our failed nodes, we need to "copy" the last node update that exists (if there even is one!)
@@ -129,6 +135,7 @@ export const processScans = async (event) => {
129135
text: `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3`,
130136
values: [userId, failedNodeId, `${new Date().toISOString().split('T')[0]}%`],
131137
})).rows[0]?.id;
138+
console.log(JSON.stringify({ existingNodeUpdateId }))
132139
if (existingNodeUpdateId) {
133140
await db.query({
134141
text: `UPDATE "enode_updates" SET "equalified"=$1 WHERE "id"=$2`,
@@ -141,12 +148,14 @@ export const processScans = async (event) => {
141148
text: `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)`,
142149
values: [userId, failedNodeId, false],
143150
});
151+
console.log(JSON.stringify({ message: 'no node update found!' }))
144152
}
145153
// Now that we've inserted an "unequalified" node update, let's set the parent node to "unequalified" too!
146154
await db.query({
147155
text: `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2`,
148156
values: [false, failedNodeId],
149157
});
158+
console.log(JSON.stringify({ message: 'updating parent node!' }))
150159
}
151160

152161
await db.clean();
@@ -170,17 +179,21 @@ const scanProcessor = async ({ result, jobId, userId, propertyId }) => {
170179
})).rows?.[0]?.id;
171180
}
172181
for (const row of result.nodes) {
182+
const normalizedHtml = normalizeHtmlWithVdom(row.html);
183+
const htmlHashId = hashStringToUuid(normalizedHtml);
173184
const existingId = (await db.query({
174-
text: `SELECT "id" FROM "enodes" WHERE "user_id"=$1 AND "html"=$2 AND "targets"=$3 AND "url_id"=$4`,
175-
values: [userId, row.html, JSON.stringify(row.targets), result.urls.find(obj => obj.urlId === row.relatedUrlId)?.id],
185+
text: `SELECT "id" FROM "enodes" WHERE "user_id"=$1 AND "html_hash_id"=$2 AND "url_id"=$3`,
186+
values: [userId, htmlHashId, result.urls.find(obj => obj.urlId === row.relatedUrlId)?.id],
176187
})).rows?.[0]?.id;
177188

178189
row.id = existingId ??
179190
(await db.query({
180-
text: `INSERT INTO "enodes" ("user_id", "html", "targets", "url_id", "equalified") VALUES ($1, $2, $3, $4, $5) RETURNING "id"`,
181-
values: [userId, row.html, JSON.stringify(row.targets), result.urls.find(obj => obj.urlId === row.relatedUrlId)?.id, false],
191+
text: `INSERT INTO "enodes" ("user_id", "targets", "html", "html_normalized", "html_hash_id", "url_id", "equalified") VALUES ($1, $2, $3, $4, $5, $6, $7) RETURNING "id"`,
192+
values: [userId, JSON.stringify(row.targets), row.html, normalizedHtml, htmlHashId, result.urls.find(obj => obj.urlId === row.relatedUrlId)?.id, false],
182193
})).rows?.[0]?.id;
183194

195+
// We used to compare by targets as well, but something in the scan occasionally returns different targets!!
196+
184197
const existingNodeUpdateId = (await db.query({
185198
text: `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3`,
186199
values: [userId, row.id, `${new Date().toISOString().split('T')[0]}%`],
@@ -269,4 +282,75 @@ const scanProcessor = async ({ result, jobId, userId, propertyId }) => {
269282
});
270283

271284
return result.nodes.map(obj => obj.id);
272-
}
285+
}
286+
287+
/**
 * Regex-based HTML normalizer: rewrites volatile attribute values so that two
 * snapshots of the same markup compare equal.
 *
 * Applied in order:
 *  1. Strips the query string from `src` / `href` attribute values.
 *  2. Replaces the version hash segment (8+ alphanumerics) that follows
 *     `/s/player/` inside a `data-version` attribute with `NORMALIZED`.
 *  3. Replaces any numeric `tabindex` value with `NORMALIZED`.
 *
 * @param html Raw HTML snippet; falsy input (`''`, `null`, `undefined`) is returned unchanged.
 * @returns The normalized HTML string, or the original falsy value.
 */
const normalizeHtmlWithRegex = (html: string | null | undefined) => {
    if (!html) return html;

    return html
        // Remove query parameters from src and href attributes (\2 is the opening quote)
        .replace(/(src|href)=(["'])([^?"']*)\?[^"']*?\2/g, '$1=$2$3$2')
        // Normalize data-version paths with version hashes.
        // The closing quote must back-reference group 1 (the opening quote); the previous
        // pattern used \2 (the "/s/player/" prefix), so it only matched when that prefix
        // appeared a second time and the hash was effectively never normalized.
        .replace(/data-version=(["'])(\/s\/player\/)[a-zA-Z0-9]{8,}(\/.*?\1)/g, 'data-version=$1$2NORMALIZED$3')
        // Normalize tabindex attributes
        .replace(/tabindex=["']-?\d+["']/g, 'tabindex="NORMALIZED"');
};
301+
302+
/**
 * DOM-based HTML normalizer: parses the snippet with cheerio, rewrites
 * attribute values that vary between scans, and serializes the result with
 * whitespace collapsed so the output can be hashed and compared.
 *
 * @param html Raw HTML snippet; any falsy value yields `''`.
 * @returns The normalized inner HTML of the temporary wrapper element.
 */
const normalizeHtmlWithVdom = (html) => {
    if (!html) {
        return '';
    }

    // Parse inside a known wrapper so the snippet can be serialized back without the wrapper itself
    const $ = cheerio.load(`<div id="wrapper">${html}</div>`);
    const wrapper = $('#wrapper');

    wrapper.find('*').each((_index, node) => {
        const el = $(node);

        // IDs containing a run of four or more digits are replaced with a placeholder
        const id = el.attr('id');
        if (id && /\d{4,}/.test(id)) {
            el.attr('id', 'NORMALIZED');
        }

        // Any non-empty tabindex is replaced with a placeholder
        if (el.attr('tabindex')) {
            el.attr('tabindex', 'NORMALIZED');
        }

        // Drop everything from the first "?" onwards in URL attributes
        for (const name of ['src', 'href']) {
            const value = el.attr(name);
            if (value && value.includes('?')) {
                el.attr(name, value.split('?')[0]);
            }
        }

        // h5p quiz alternatives: fold the correct/wrong state classes into one placeholder class
        const hasAnswerState = el.hasClass('h5p-sc-is-correct') || el.hasClass('h5p-sc-is-wrong');
        if (el.hasClass('h5p-sc-alternative') && hasAnswerState) {
            el.removeClass('h5p-sc-is-correct h5p-sc-is-wrong')
                .addClass('h5p-sc-is-NORMALIZED');
        }

        // Replace the hash segment (8+ alphanumerics) after /s/player/ in data-version
        const version = el.attr('data-version');
        if (version && version.includes('/s/player/')) {
            el.attr('data-version', version.replace(/\/s\/player\/[a-zA-Z0-9]{8,}\//, '/s/player/NORMALIZED/'));
        }

        // Add more element-specific normalizations based on your data patterns
    });

    return wrapper.html()
        // Remove whitespace-only gaps between adjacent tags
        .replace(/>\s+</g, '><')
        // Collapse any remaining run of two or more whitespace characters to a single space
        .replace(/\s{2,}/g, ' ');
}

src/routes/adminClearCache.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import { jwtClaims } from '#src/app';
2+
import { db } from '#src/utils';
3+
4+
/**
 * Admin-only route handler that resets `cache_date` to a fixed date
 * ('2025-01-01') on every report owned by the calling user.
 *
 * The caller is an admin when `jwtClaims.sub` appears in the JSON array stored
 * in the `ADMIN_IDS` environment variable.
 *
 * @returns `{ success: true }` after the update ran, `{ success: false }` when the caller is not an admin.
 */
export const adminClearCache = async ({ request, reply }) => {
    // A missing ADMIN_IDS is treated as "no admins" instead of letting JSON.parse(undefined) throw
    const parsedAdminIds = JSON.parse(process.env.ADMIN_IDS ?? '[]');
    const adminIds = Array.isArray(parsedAdminIds) ? parsedAdminIds : [];
    if (!adminIds.includes(jwtClaims.sub)) {
        return { success: false };
    }

    await db.connect();
    try {
        await db.query({
            text: `UPDATE "reports" SET "cache_date"=$1 WHERE "user_id"=$2`,
            values: ['2025-01-01', jwtClaims.sub],
        });
    }
    finally {
        // Always pair connect() with clean(), even when the query throws.
        // NOTE(review): assumes db.clean() releases the connection opened by db.connect() — confirm in #src/utils.
        await db.clean();
    }
    return { success: true };
}

src/routes/adminProcessScans.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { jwtClaims } from '#src/app';
2+
import { runEveryFifteenMinutes } from '#src/scheduled/runEveryFifteenMinutes';
3+
4+
/**
 * Admin-only route handler that runs `runEveryFifteenMinutes()` on demand and
 * waits for it to finish.
 *
 * The caller is an admin when `jwtClaims.sub` appears in the JSON array stored
 * in the `ADMIN_IDS` environment variable.
 *
 * @returns `{ success: true }` once the job completed, `{ success: false }` when the caller is not an admin.
 */
export const adminProcessScans = async ({ request, reply }) => {
    // A missing ADMIN_IDS is treated as "no admins" instead of letting JSON.parse(undefined) throw
    const parsedAdminIds = JSON.parse(process.env.ADMIN_IDS ?? '[]');
    const adminIds = Array.isArray(parsedAdminIds) ? parsedAdminIds : [];
    if (!adminIds.includes(jwtClaims.sub)) {
        return { success: false };
    }

    await runEveryFifteenMinutes();
    return { success: true };
}

src/routes/getApikey.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@ import { db, formatId } from '#src/utils';
33

44
/**
 * Returns the calling user's API key (passed through `formatId`) together with
 * a flag telling whether the user is listed in the `ADMIN_IDS` environment variable.
 *
 * @returns `{ apikey, isAdmin }` for the user whose id equals `jwtClaims.sub`.
 */
export const getApikey = async ({ request, reply }) => {
    await db.connect();
    let user;
    try {
        user = (await db.query(`SELECT "id", "apikey" FROM "users" WHERE "id" = $1`, [jwtClaims.sub])).rows?.[0];
    }
    finally {
        // Always pair connect() with clean(), even when the query throws.
        // NOTE(review): assumes db.clean() releases the connection opened by db.connect() — confirm in #src/utils.
        await db.clean();
    }

    // A missing ADMIN_IDS is treated as "no admins" instead of letting JSON.parse(undefined) throw
    const parsedAdminIds = JSON.parse(process.env.ADMIN_IDS ?? '[]');
    const adminIds = Array.isArray(parsedAdminIds) ? parsedAdminIds : [];

    // `user` is undefined when no row matches; read its fields optionally instead of
    // destructuring, which would throw a TypeError.
    // NOTE(review): formatId then receives undefined — confirm it tolerates that.
    return { apikey: formatId(user?.apikey), isAdmin: adminIds.includes(user?.id) };
}

0 commit comments

Comments
 (0)