11import { chunk , db , isStaging , sleep , hashStringToUuid } from '#src/utils' ;
2+ import * as cheerio from 'cheerio' ;
23
34export const processScans = async ( event ) => {
45 console . log ( `START PROCESS SCANS` ) ;
@@ -95,13 +96,16 @@ export const processScans = async (event) => {
9596 const equalifiedNodeIds = ( await db . query ( {
9697 text : `SELECT "id" FROM "enodes" WHERE "equalified"=$1 AND "user_id"=$2 AND "url_id"=ANY($3)` ,
9798 values : [ false , userId , allPropertyUrlIds ] ,
98- } ) ) . rows . filter ( obj => ! [ ...allNodeIds , ...failedNodeIds ] . map ( obj => obj . id ) . includes ( obj . id ) ) . map ( obj => obj . id ) ;
99+ } ) ) . rows . map ( obj => obj . id ) . filter ( obj => ! [ ...allNodeIds , ...failedNodeIds ] . map ( obj => obj ) . includes ( obj ) ) ;
100+
101+ console . log ( JSON . stringify ( { allPropertyUrlIds, equalifiedNodeIds, allNodeIds, failedNodeIds } ) ) ;
99102
100103 for ( const equalifiedNodeId of equalifiedNodeIds ) {
101104 const existingNodeUpdateId = ( await db . query ( {
102105 text : `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3` ,
103106 values : [ userId , equalifiedNodeId , `${ new Date ( ) . toISOString ( ) . split ( 'T' ) [ 0 ] } %` ] ,
104107 } ) ) . rows [ 0 ] ?. id ;
108+ console . log ( JSON . stringify ( { equalifiedNodeId } ) ) ;
105109 if ( existingNodeUpdateId ) {
106110 // We found an existing node update for today, let's simply update it
107111 await db . query ( {
@@ -115,12 +119,14 @@ export const processScans = async (event) => {
115119 text : `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)` ,
116120 values : [ userId , equalifiedNodeId , true ] ,
117121 } ) ;
122+ console . log ( JSON . stringify ( { message : 'Inserted equalified node update' } ) ) ;
118123 }
119124 // Now that we've inserted an "equalified" node update, let's set the parent node to "equalified" too!
120125 await db . query ( {
121126 text : `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2` ,
122127 values : [ true , equalifiedNodeId ] ,
123128 } ) ;
129+ console . log ( JSON . stringify ( { message : 'Update equalified node' } ) ) ;
124130 }
125131
126132 // For our failed nodes, we need to "copy" the last node update that exists (if there even is one!)
@@ -129,6 +135,7 @@ export const processScans = async (event) => {
129135 text : `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3` ,
130136 values : [ userId , failedNodeId , `${ new Date ( ) . toISOString ( ) . split ( 'T' ) [ 0 ] } %` ] ,
131137 } ) ) . rows [ 0 ] ?. id ;
138+ console . log ( JSON . stringify ( { existingNodeUpdateId } ) )
132139 if ( existingNodeUpdateId ) {
133140 await db . query ( {
134141 text : `UPDATE "enode_updates" SET "equalified"=$1 WHERE "id"=$2` ,
@@ -141,12 +148,14 @@ export const processScans = async (event) => {
141148 text : `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)` ,
142149 values : [ userId , failedNodeId , false ] ,
143150 } ) ;
151+ console . log ( JSON . stringify ( { message : 'no node update found!' } ) )
144152 }
145153 // Now that we've inserted an "unequalified" node update, let's set the parent node to "unequalified" too!
146154 await db . query ( {
147155 text : `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2` ,
148156 values : [ false , failedNodeId ] ,
149157 } ) ;
158+ console . log ( JSON . stringify ( { message : 'updating parent node!' } ) )
150159 }
151160
152161 await db . clean ( ) ;
@@ -170,17 +179,21 @@ const scanProcessor = async ({ result, jobId, userId, propertyId }) => {
170179 } ) ) . rows ?. [ 0 ] ?. id ;
171180 }
172181 for ( const row of result . nodes ) {
182+ const normalizedHtml = normalizeHtmlWithVdom ( row . html ) ;
183+ const htmlHashId = hashStringToUuid ( normalizedHtml ) ;
173184 const existingId = ( await db . query ( {
174- text : `SELECT "id" FROM "enodes" WHERE "user_id"=$1 AND "html "=$2 AND "targets"=$3 AND " url_id"=$4 ` ,
175- values : [ userId , row . html , JSON . stringify ( row . targets ) , result . urls . find ( obj => obj . urlId === row . relatedUrlId ) ?. id ] ,
185+ text : `SELECT "id" FROM "enodes" WHERE "user_id"=$1 AND "html_hash_id "=$2 AND "url_id"=$3 ` ,
186+ values : [ userId , htmlHashId , result . urls . find ( obj => obj . urlId === row . relatedUrlId ) ?. id ] ,
176187 } ) ) . rows ?. [ 0 ] ?. id ;
177188
178189 row . id = existingId ??
179190 ( await db . query ( {
180- text : `INSERT INTO "enodes" ("user_id", "html", "targets ", "url_id", "equalified") VALUES ($1, $2, $3, $4, $5) RETURNING "id"` ,
181- values : [ userId , row . html , JSON . stringify ( row . targets ) , result . urls . find ( obj => obj . urlId === row . relatedUrlId ) ?. id , false ] ,
191+ text : `INSERT INTO "enodes" ("user_id", "targets", " html", "html_normalized ", "html_hash_id", " url_id", "equalified") VALUES ($1, $2, $3, $4, $5, $6, $7 ) RETURNING "id"` ,
192+ values : [ userId , JSON . stringify ( row . targets ) , row . html , normalizedHtml , htmlHashId , result . urls . find ( obj => obj . urlId === row . relatedUrlId ) ?. id , false ] ,
182193 } ) ) . rows ?. [ 0 ] ?. id ;
183194
195+ // We used to compare by targets as well, but something in the scan occasionally returns different targets!!
196+
184197 const existingNodeUpdateId = ( await db . query ( {
185198 text : `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3` ,
186199 values : [ userId , row . id , `${ new Date ( ) . toISOString ( ) . split ( 'T' ) [ 0 ] } %` ] ,
@@ -269,4 +282,75 @@ const scanProcessor = async ({ result, jobId, userId, propertyId }) => {
269282 } ) ;
270283
271284 return result . nodes . map ( obj => obj . id ) ;
272- }
285+ }
286+
/**
 * Normalizes volatile parts of an HTML string using regular expressions only
 * (no DOM parsing).
 *
 * Three rewrites are applied, in order:
 *  1. the query string is stripped from quoted `src` / `href` attribute values;
 *  2. the hash segment following `/s/player/` inside a quoted `data-version`
 *     attribute is replaced with `NORMALIZED`;
 *  3. numeric `tabindex` attributes are rewritten to `tabindex="NORMALIZED"`.
 *
 * @param {string} html - HTML markup to normalize.
 * @returns {string} The normalized markup. Falsy input (`''`, `null`,
 *   `undefined`) is returned unchanged.
 */
const normalizeHtmlWithRegex = (html) => {
    if (!html) return html;

    return html
        // Remove query parameters from src and href attributes.
        // Group 2 is the opening quote; \2 requires the same quote to close the value.
        .replace(/(src|href)=(["'])([^?"']*)\?[^"']*?\2/g, '$1=$2$3$2')
        // Normalize data-version paths with version hashes.
        // Here the opening quote is group 1, so the closing quote must be \1.
        // (Group 2 is the "/s/player/" prefix; back-referencing it meant ordinary
        // attribute values never matched.)
        .replace(/data-version=(["'])(\/s\/player\/)[a-zA-Z0-9]{8,}(\/.*?\1)/g, 'data-version=$1$2NORMALIZED$3')
        // Normalize tabindex attributes (always emitted with double quotes).
        .replace(/tabindex=["']-?\d+["']/g, 'tabindex="NORMALIZED"');
};
301+
/**
 * Produces a normalized version of an HTML fragment by parsing it with cheerio,
 * rewriting attributes that carry volatile values, and collapsing whitespace in
 * the serialized result.
 *
 * @param {string} html - HTML fragment to normalize.
 * @returns {string} The normalized markup, or `''` when `html` is falsy.
 */
const normalizeHtmlWithVdom = (html) => {
    if (!html) {
        return '';
    }

    // Parse the fragment inside a known wrapper so its inner markup can be read back out
    const $ = cheerio.load(`<div id="wrapper">${html}</div>`);
    const wrapper = $('#wrapper');

    // Visit every descendant of the wrapper
    wrapper.find('*').each((_index, node) => {
        const element = $(node);

        // IDs containing a run of four or more digits are replaced with a placeholder
        const id = element.attr('id');
        if (id && /\d{4,}/.test(id)) {
            element.attr('id', 'NORMALIZED');
        }

        // Any non-empty tabindex value is replaced with a placeholder
        if (element.attr('tabindex')) {
            element.attr('tabindex', 'NORMALIZED');
        }

        // Keep only the part of src/href that precedes the first "?"
        for (const name of ['src', 'href']) {
            const value = element.attr(name);
            if (value && value.includes('?')) {
                element.attr(name, value.split('?')[0]);
            }
        }

        // h5p quiz alternatives: fold the correct/wrong marker classes into a single one
        const hasAnswerMarker = element.hasClass('h5p-sc-alternative')
            && (element.hasClass('h5p-sc-is-correct') || element.hasClass('h5p-sc-is-wrong'));
        if (hasAnswerMarker) {
            element.removeClass('h5p-sc-is-correct h5p-sc-is-wrong');
            element.addClass('h5p-sc-is-NORMALIZED');
        }

        // Replace the hash segment after /s/player/ in data-version attributes
        const dataVersion = element.attr('data-version');
        if (dataVersion && dataVersion.includes('/s/player/')) {
            element.attr(
                'data-version',
                dataVersion.replace(/\/s\/player\/[a-zA-Z0-9]{8,}\//, '/s/player/NORMALIZED/'),
            );
        }

        // Add more element-specific normalizations based on your data patterns
    });

    return wrapper
        .html()
        // Drop whitespace that sits between adjacent tags
        .replace(/>\s+</g, '><')
        // Collapse any remaining run of two or more whitespace characters to one space
        .replace(/\s{2,}/g, ' ');
};
0 commit comments