1- import { chunk , db , isStaging , sleep , hashStringToUuid } from '#src/utils' ;
2- import * as cheerio from 'cheerio' ;
1+ import { chunk , db , isStaging , sleep , hashStringToUuid , normalizeHtmlWithVdom } from '#src/utils' ;
32
43export const processScans = async ( event ) => {
5- console . log ( `START PROCESS SCANS ` ) ;
4+ console . log ( `Start processScans ` ) ;
65 const startTime = new Date ( ) . getTime ( ) ;
76 await db . connect ( ) ;
87 const { userId, propertyId, discovery } = event ;
@@ -16,13 +15,13 @@ export const processScans = async (event) => {
1615 } ) ) . rows . map ( obj => obj . job_id ) ;
1716 const allNodeIds = [ ] ;
1817 const failedNodeIds = [ ] ;
19- const pollScans = ( givenJobIds ) => new Promise ( async ( finalRes ) => {
18+ const pollScans = ( givenJobIds ) => new Promise ( async ( outerRes ) => {
2019 await sleep ( 1000 ) ;
2120 const remainingScans = [ ] ;
2221 const batchesOfJobIds = chunk ( givenJobIds , 25 ) ;
2322 for ( const [ index , batchOfJobIds ] of batchesOfJobIds . entries ( ) ) {
24- console . log ( `Start ${ index } of ${ batchesOfJobIds . length } batches` ) ;
25- await Promise . allSettled ( batchOfJobIds . map ( jobId => new Promise ( async ( res ) => {
23+ console . log ( `Start ${ index + 1 } of ${ batchesOfJobIds . length } batches` ) ;
24+ await Promise . allSettled ( batchOfJobIds . map ( jobId => new Promise ( async ( innerRes ) => {
2625 try {
2726 const scanResults = await fetch ( `https://scan${ isStaging ? '-dev' : '' } .equalify.app/results/${ jobId } ` , { signal : AbortSignal . timeout ( 10000 ) } ) ;
2827 const { result, status } = await scanResults . json ( ) ;
@@ -57,8 +56,9 @@ export const processScans = async (event) => {
5756 console . log ( err ) ;
5857 remainingScans . push ( jobId ) ;
5958 }
60- res ( 1 ) ;
59+ innerRes ( 1 ) ;
6160 } ) ) ) ;
61+ console . log ( `End ${ index + 1 } of ${ batchesOfJobIds . length } batches` ) ;
6262 }
6363 const stats = { userId, remainingScans : remainingScans . length } ;
6464 console . log ( JSON . stringify ( stats ) ) ;
@@ -68,6 +68,7 @@ export const processScans = async (event) => {
6868 const tenMinutes = 10 * 60 * 1000 ;
6969 if ( deltaTime <= tenMinutes ) {
7070 await pollScans ( remainingScans ) ;
71+ outerRes ( 1 ) ;
7172 }
7273 else if ( deltaTime > tenMinutes ) {
7374 const scansExist = ( await db . query ( {
@@ -81,12 +82,13 @@ export const processScans = async (event) => {
8182 }
8283 }
8384 }
84- else {
85- finalRes ( 1 ) ;
86- }
85+ outerRes ( 1 ) ;
8786 } ) ;
87+ console . log ( `Start pollScans` ) ;
8888 await pollScans ( jobIds ) ;
89+ console . log ( `End pollScans` ) ;
8990
91+ console . log ( `Start equalification` ) ;
9092 // At the end of all scans, reconcile equalified nodes
9193 // Set node equalified to true for previous nodes associated w/ this scan (EXCEPT failed ones)
9294 const allPropertyUrlIds = ( await db . query ( {
@@ -105,7 +107,6 @@ export const processScans = async (event) => {
105107 text : `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3` ,
106108 values : [ userId , equalifiedNodeId , `${ new Date ( ) . toISOString ( ) . split ( 'T' ) [ 0 ] } %` ] ,
107109 } ) ) . rows [ 0 ] ?. id ;
108- console . log ( JSON . stringify ( { equalifiedNodeId } ) ) ;
109110 if ( existingNodeUpdateId ) {
110111 // We found an existing node update for today, let's simply update it
111112 await db . query ( {
@@ -119,14 +120,12 @@ export const processScans = async (event) => {
119120 text : `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)` ,
120121 values : [ userId , equalifiedNodeId , true ] ,
121122 } ) ;
122- console . log ( JSON . stringify ( { message : 'Inserted equalified node update' } ) ) ;
123123 }
124124 // Now that we've inserted an "equalified" node update, let's set the parent node to "equalified" too!
125125 await db . query ( {
126126 text : `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2` ,
127127 values : [ true , equalifiedNodeId ] ,
128128 } ) ;
129- console . log ( JSON . stringify ( { message : 'Update equalified node' } ) ) ;
130129 }
131130
132131 // For our failed nodes, we need to "copy" the last node update that exists (if there even is one!)
@@ -135,7 +134,6 @@ export const processScans = async (event) => {
135134 text : `SELECT "id" FROM "enode_updates" WHERE "user_id"=$1 AND "enode_id"=$2 AND "created_at"::text LIKE $3` ,
136135 values : [ userId , failedNodeId , `${ new Date ( ) . toISOString ( ) . split ( 'T' ) [ 0 ] } %` ] ,
137136 } ) ) . rows [ 0 ] ?. id ;
138- console . log ( JSON . stringify ( { existingNodeUpdateId } ) )
139137 if ( existingNodeUpdateId ) {
140138 await db . query ( {
141139 text : `UPDATE "enode_updates" SET "equalified"=$1 WHERE "id"=$2` ,
@@ -148,18 +146,18 @@ export const processScans = async (event) => {
148146 text : `INSERT INTO "enode_updates" ("user_id", "enode_id", "equalified") VALUES ($1, $2, $3)` ,
149147 values : [ userId , failedNodeId , false ] ,
150148 } ) ;
151- console . log ( JSON . stringify ( { message : 'no node update found!' } ) )
152149 }
153150 // Now that we've inserted an "unequalified" node update, let's set the parent node to "unequalified" too!
154151 await db . query ( {
155152 text : `UPDATE "enodes" SET "equalified"=$1 WHERE "id"=$2` ,
156153 values : [ false , failedNodeId ] ,
157154 } ) ;
158- console . log ( JSON . stringify ( { message : 'updating parent node!' } ) )
159155 }
160156
157+ console . log ( `End equalification` ) ;
158+ console . log ( `End processScans` ) ;
159+
161160 await db . clean ( ) ;
162- console . log ( `END PROCESS SCANS` ) ;
163161 return ;
164162}
165163
@@ -282,75 +280,4 @@ const scanProcessor = async ({ result, jobId, userId, propertyId }) => {
282280 } ) ;
283281
284282 return result . nodes . map ( obj => obj . id ) ;
285- }
286-
287- const normalizeHtmlWithRegex = ( html ) => {
288- if ( ! html ) return html ;
289-
290- // Remove query parameters from src and href attributes
291- html = html . replace ( / ( s r c | h r e f ) = ( [ " ' ] ) ( [ ^ ? " ' ] * ) \? [ ^ " ' ] * ?\2/ g, '$1=$2$3$2' ) ;
292-
293- // Normalize data-version paths with version hashes
294- html = html . replace ( / d a t a - v e r s i o n = ( [ " ' ] ) ( \/ s \/ p l a y e r \/ ) [ a - z A - Z 0 - 9 ] { 8 , } ( \/ .* ?\2) / g, 'data-version=$1$2NORMALIZED$3' ) ;
295-
296- // Normalize tabindex attributes
297- html = html . replace ( / t a b i n d e x = [ " ' ] - ? \d + [ " ' ] / g, 'tabindex="NORMALIZED"' ) ;
298-
299- return html ;
300- } ;
301-
302- const normalizeHtmlWithVdom = ( html ) => {
303- if ( ! html ) return '' ;
304-
305- const $ = cheerio . load ( `<div id="wrapper">${ html } </div>` ) ;
306- const root = $ ( '#wrapper' ) ;
307-
308- // Process all elements
309- root . find ( '*' ) . each ( function ( ) {
310- const el = $ ( this ) ;
311-
312- // Normalize IDs with numbers
313- if ( el . attr ( 'id' ) && / \d { 4 , } / . test ( el . attr ( 'id' ) ) ) {
314- el . attr ( 'id' , 'NORMALIZED' ) ;
315- }
316-
317- // Always normalize tabindex
318- if ( el . attr ( 'tabindex' ) ) {
319- el . attr ( 'tabindex' , 'NORMALIZED' ) ;
320- }
321-
322- // Remove query params from URLs
323- [ 'src' , 'href' ] . forEach ( attr => {
324- if ( el . attr ( attr ) && el . attr ( attr ) . includes ( '?' ) ) {
325- el . attr ( attr , el . attr ( attr ) . split ( '?' ) [ 0 ] ) ;
326- }
327- } ) ;
328-
329- // Handle h5p quiz elements
330- if ( el . hasClass ( 'h5p-sc-alternative' ) ) {
331- if ( el . hasClass ( 'h5p-sc-is-correct' ) || el . hasClass ( 'h5p-sc-is-wrong' ) ) {
332- el . removeClass ( 'h5p-sc-is-correct h5p-sc-is-wrong' )
333- . addClass ( 'h5p-sc-is-NORMALIZED' ) ;
334- }
335- }
336-
337- // Normalize data-version attributes
338- if ( el . attr ( 'data-version' ) && el . attr ( 'data-version' ) . includes ( '/s/player/' ) ) {
339- el . attr ( 'data-version' , el . attr ( 'data-version' )
340- . replace ( / \/ s \/ p l a y e r \/ [ a - z A - Z 0 - 9 ] { 8 , } \/ / , '/s/player/NORMALIZED/' ) ) ;
341- }
342-
343- // Add more element-specific normalizations based on your data patterns
344- } ) ;
345-
346- // Remove all whitespace between tags for more reliable comparison
347- let result = root . html ( ) ;
348-
349- // Remove excess whitespace for more consistent matching
350- result = result . replace ( / > \s + < / g, '><' ) ;
351-
352- // Remove all text node whitespace variations
353- result = result . replace ( / \s { 2 , } / g, ' ' ) ;
354-
355- return result ;
356- }
283+ }
0 commit comments