Skip to content

Commit 6347e1a

Browse files
committed
xxhash impl prototype
1 parent 356beb7 commit 6347e1a

File tree

10 files changed

+169
-31
lines changed

10 files changed

+169
-31
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@
77
[submodule "dep/jemalloc"]
88
path = dep/jemalloc
99
url = https://github.com/jemalloc/jemalloc
10+
[submodule "dep/xxHash"]
11+
path = dep/xxHash
12+
url = https://github.com/Cyan4973/xxHash

CMakeLists.txt

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ set(ENABLE_CJSON_TEST OFF)
99
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
1010
add_subdirectory(dep/cJSON)
1111

12+
# link xxHash statically
13+
set(XXHASH_BUILD_XXHSUM OFF)
14+
# option() takes <variable> "<help_text>" [value]; pass a real help string and
# an explicit OFF default so xxHash is built as a static library.
option(BUILD_SHARED_LIBS "Build dependencies as shared libraries" OFF)
15+
add_subdirectory(dep/xxHash/build/cmake xxhash_build EXCLUDE_FROM_ALL)
16+
1217
# use strict C11
1318
set(CMAKE_C_STANDARD 11)
1419
set(CMAKE_C_STANDARD_REQUIRED ON)
@@ -29,16 +34,16 @@ add_executable(fsautoproc ${SOURCES} ${HEADERS})
2934
add_compile_definitions(JEMALLOC_NO_DEMANGLE)
3035

3136
target_link_directories(fsautoproc PRIVATE dep dep/jemalloc/lib)
32-
target_link_libraries(fsautoproc PRIVATE cjson pthread libjemalloc.a)
37+
target_link_libraries(fsautoproc PRIVATE cjson pthread libjemalloc.a xxhash)
3338
target_include_directories(fsautoproc PRIVATE include dep dep/jemalloc/include)
3439

3540
install(TARGETS fsautoproc DESTINATION bin)
3641

3742
# libdeng static library for unit tests
38-
add_library(deng STATIC src/deng.c src/index.c src/fs.c src/je.c)
43+
add_library(deng STATIC src/deng.c src/index.c src/fs.c src/je.c src/xx.c)
3944

4045
target_link_directories(deng PUBLIC dep/jemalloc/lib)
41-
target_link_libraries(deng PUBLIC libjemalloc.a)
46+
target_link_libraries(deng PUBLIC libjemalloc.a xxhash)
4247
target_include_directories(deng PUBLIC include dep dep/jemalloc/include)
4348

4449
# unit tests

dep/xxHash

Submodule xxHash added at c961fbe

include/index.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ struct inode_s {
1414
char* fp; ///< File path (string duplicated)
1515
uint64_t fphash; ///< File path hash value
1616
struct fsstat_s st; ///< File stat info structure
17+
uint64_t xx; ///< xxHash64 hash value
1718
struct inode_s* next;///< Next node in the index map
1819
};
1920

@@ -78,10 +79,11 @@ int indexread(struct index_s* idx, FILE* s);
7879
/// @param fp The file path to use for the new node, duplicated internally
7980
/// @param fphash The file path hash value to use for the new node
8081
/// @param st The file stat info to use for the new node
82+
/// @param xx The xxHash64 hash value to use for the new node
8183
/// @return The pointer to the new node in the index map, otherwise NULL is
8284
/// returned and `errno` is set.
8385
struct inode_s* indexput(struct index_s* idx, const char* fp, uint64_t fphash,
84-
const struct fsstat_s* st);
86+
const struct fsstat_s* st, uint64_t xx);
8587

8688
/// @brief Frees all nodes in the index map.
8789
/// @param idx The index to free

include/xx.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/// @file xx.h
2+
/// @brief xxHash64 file hashing functions.
3+
#ifndef FSAUTOPROC_XX_H
4+
#define FSAUTOPROC_XX_H
5+
6+
#include <stddef.h>
7+
#include <stdint.h>
8+
9+
/// @def XXBUFSZE
10+
/// @brief Size of the buffer used for reading files to hash.
11+
#define XXBUFSZE 65536
12+
13+
/// @struct xxreq_s
14+
/// @brief File hash request structure passed to xxupdate(). It bundles the
/// read buffer, the file path, the previous and current stat info, and the
/// previously known hash value.
/// @note Callers initialize this structure positionally, so the member order
/// must not change.
15+
struct xxreq_s {
16+
uint8_t* b; ///< Read buffer for file data; filled with up to XXBUFSZE bytes per read
17+
const char* fp; ///< Filepath to hash
18+
const struct fsstat_s* pst;///< Previous file stat info, may be NULL (always re-hash)
19+
const struct fsstat_s* cst;///< Current file stat info, compared against \p pst
20+
uint64_t pxx; ///< Previous hash value, or 0 if unknown
21+
};
22+
23+
/// @brief Processes a file hash request. If no previous stat info \p pst is
24+
/// given, or the file's last modified time or size differ from \p pst, the
25+
/// file is re-hashed and the new hash value is returned. Otherwise the
26+
/// previous \p pxx hash value is returned unchanged, indicating no change.
27+
/// @param req The file hash request structure containing the filepath, previous
28+
/// and current stat info, and previous hash value (if known). Must not be NULL.
29+
/// @param cause Optional string indicating the cause for the hash request, for
30+
/// logging purposes. May be NULL, in which case nothing is logged.
31+
/// @return 0 if an error occurred while hashing, otherwise the current hash
/// value. Note that an unchanged file with an unknown previous hash
/// (\p pxx == 0) also yields 0.
32+
uint64_t xxupdate(struct xxreq_s* req, const char* cause);
33+
34+
#endif//FSAUTOPROC_XX_H

src/deng.c

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,13 @@
1010
#include "fs.h"
1111
#include "index.h"
1212
#include "log.h"
13+
#include "xx.h"
14+
15+
/// @def XXBUFSZE
16+
/// @brief Size of the buffer used for reading files to hash.
/// NOTE(review): "xx.h" (included above) already defines XXBUFSZE with the
/// same value; this copy looks redundant and the two could drift apart.
17+
#define XXBUFSZE 65536
18+
19+
static uint8_t xxbuf[XXBUFSZE];///< Shared read buffer handed to xxupdate() by the pre and post stages
1320

1421
/// @struct deng_state_s
1522
/// @brief Search state context provided to the diff engine as user data which
@@ -58,14 +65,15 @@ static int stagepre(const char* fp, const struct fsstat_s* st, void* udata) {
5865
// attempt to match file in previous index
5966
struct inode_s* prev = indexfind(mach->lastmap, fp, fphash);
6067

61-
// lookup from previous iteration or insert new record and lookup
62-
struct inode_s* curr = indexfind(mach->thismap, fp, fphash);
63-
if (curr == NULL)
64-
if ((curr = indexput(mach->thismap, fp, fphash, st)) == NULL) return -1;
68+
// insert node for current index
69+
struct inode_s* curr = indexput(mach->thismap, fp, fphash, st, 0);
70+
if (!curr) return -1;
6571

6672
if (prev != NULL) {
67-
if (prev->st.lmod != st->lmod || prev->st.fsze != st->fsze)
68-
callevent(mach, DENG_FEVENT_MOD, curr);
73+
// file existed in previous index, check if modified
74+
struct xxreq_s req = {xxbuf, fp, &prev->st, st, prev->xx};
75+
curr->xx = xxupdate(&req, "stage=pre,node=existing");
76+
if (req.pxx != curr->xx) callevent(mach, DENG_FEVENT_MOD, curr);
6977
} else {
7078
callevent(mach, DENG_FEVENT_NEW, curr);
7179
}
@@ -86,16 +94,28 @@ static int stagepost(const char* fp, const struct fsstat_s* st, void* udata) {
8694

8795
if (mach->ffn != NULL && mach->ffn(fp)) return 0;// skip filtered files
8896

89-
const uint64_t fphash = indexhash(fp);
97+
struct xxreq_s req = {xxbuf, fp, NULL, st, 0};
98+
char* cause = NULL;
9099

100+
const uint64_t fphash = indexhash(fp);
91101
struct inode_s* curr = indexfind(mach->thismap, fp, fphash);
92102
if (curr != NULL) {
93-
curr->st = *st;// update the file info in the current index
94-
return 0;
103+
req.pst = &curr->st;
104+
req.pxx = curr->xx;
105+
cause = "stage=post,node=existing";
106+
} else {
107+
cause = "stage=post,node=new";
95108
}
96109

97-
if ((curr = indexput(mach->thismap, fp, fphash, st)) == NULL) return -1;
98-
callevent(mach, DENG_FEVENT_NEW, curr);
110+
const uint64_t xx = xxupdate(&req, cause);
111+
112+
if (curr != NULL) {
113+
curr->xx = xx;// update hash of existing record
114+
} else {
115+
// file was not found in previous stage, insert new record
116+
if ((curr = indexput(mach->thismap, fp, fphash, st, xx)) == NULL) return -1;
117+
callevent(mach, DENG_FEVENT_NEW, curr);
118+
}
99119

100120
return 0;
101121
}

src/index.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ int indexwrite(struct index_s* idx, FILE* s) {
6464
int err = 0;
6565
for (long i = 0; i < idx->size; i++) {
6666
struct inode_s* node = fl[i];
67-
const int n = snprintf(lbuf, sizeof(lbuf), "%s,%" PRIu64 ",%" PRIu64 "\n",
68-
node->fp, node->st.lmod, node->st.fsze);
67+
const int n = snprintf(lbuf, sizeof(lbuf),
68+
"%s,%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n", node->fp,
69+
node->st.lmod, node->st.fsze, node->xx);
6970
if (fwrite(lbuf, n, 1, s) != 1) {
7071
err = -1;
7172
break;
@@ -78,10 +79,11 @@ int indexwrite(struct index_s* idx, FILE* s) {
7879
int indexread(struct index_s* idx, FILE* s) {
7980
char fp[INDEXMAXFP] = {0}; /* fscanf filepath string buffer */
8081
struct fsstat_s st = {0}; /* fscanf file stat structure */
81-
while (fscanf(s, "%[^,],%" PRIu64 ",%" PRIu64 "\n", fp, &st.lmod, &st.fsze) ==
82-
3) {
82+
uint64_t xx = 0; /* fscanf xxHash64 value */
83+
while (fscanf(s, "%[^,],%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n", fp, &st.lmod,
84+
&st.fsze, &xx) == 4) {
8385
const uint64_t fphash = indexhash(fp);
84-
if (indexput(idx, fp, fphash, &st) == NULL) return -1;
86+
if (indexput(idx, fp, fphash, &st, xx) == NULL) return -1;
8587
}
8688
return 0;
8789
}
@@ -101,14 +103,15 @@ static void indexappend(struct ibucket_s* bucket, struct inode_s* node) {
101103
}
102104

103105
struct inode_s* indexput(struct index_s* idx, const char* fp,
104-
const uint64_t fphash, const struct fsstat_s* st) {
106+
const uint64_t fphash, const struct fsstat_s* st,
107+
const uint64_t xx) {
105108
struct inode_s* node = je_malloc(sizeof(struct inode_s));
106109
if (node == NULL) return NULL;
107110
if ((fp = je_strdup(fp)) == NULL) {// duplicate filepath string
108111
je_free(node);
109112
return NULL;
110113
}
111-
*node = (struct inode_s) {(char*) fp, fphash, *st, NULL};
114+
*node = (struct inode_s) {(char*) fp, fphash, *st, xx, NULL};
112115
struct ibucket_s* bucket = &idx->buckets[indexbucket(node->fphash)];
113116
indexappend(bucket, node);
114117
idx->size++;

src/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@ static bool filterjunk(const char* fp) {
252252
if (initargs.verbose) log_info("[j] %s", fp);
253253
} else {
254254
const struct fsstat_s st = {0};
255-
indexput(&goodmap, fp, fphash, &st);// mark as known good
255+
indexput(&goodmap, fp, fphash, &st, 0);// mark as known good
256256
}
257257
return junk;
258258
}

src/tp.c

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@
1717
#include "lcmd.h"
1818
#include "log.h"
1919
#include "tm.h"
20+
#include "xx.h"
2021

2122
/// @struct thrd_s
2223
/// @brief Initialized worker thread in the thread pool.
2324
struct thrd_s {
24-
_Atomic bool initd; ///< Thread initialized flag
25-
_Atomic bool rsrvd; ///< Work request reservation flag
26-
_Atomic bool canwork;///< Work request ready to process flag
27-
struct tpreq_s work; ///< Work request to process
28-
pthread_t tid; ///< System thread identifier
29-
_Bool fdsopen; ///< File descriptor set open flag
30-
struct fdset_s fds; ///< Output file descriptor set
25+
_Atomic bool initd; ///< Thread initialized flag
26+
_Atomic bool rsrvd; ///< Work request reservation flag
27+
_Atomic bool canwork; ///< Work request ready to process flag
28+
struct tpreq_s work; ///< Work request to process
29+
pthread_t tid; ///< System thread identifier
30+
_Bool fdsopen; ///< File descriptor set open flag
31+
struct fdset_s fds; ///< Output file descriptor set
32+
uint8_t xxbuf[XXBUFSZE];///< Buffer for reading files to hash
3133
};
3234

3335
static struct thrd_s** thrds; ///< Thread pool worker threads array
@@ -57,8 +59,16 @@ static void* tpentrypoint(void* arg) {
5759
if ((err = lcmdexec(req->cs, req->node->fp, self->fds, req->flags)))
5860
log_error("thread execution error: %d", err);
5961
if (req->flags & (LCTRIG_NEW | LCTRIG_MOD)) {
60-
if ((err = fsstat(req->node->fp, &req->node->st)))
62+
struct fsstat_s st = {0};
63+
if ((err = fsstat(req->node->fp, &st))) {
6164
log_error("stat error: %d", err);
65+
} else {
66+
// check if the stats have changed, if so, re-hash
67+
struct xxreq_s xx = {self->xxbuf, req->node->fp, &req->node->st, &st,
68+
req->node->xx};
69+
req->node->xx = xxupdate(&xx, "stage=tp");
70+
req->node->st = st;// update node with potentially new stat info
71+
}
6272
}
6373

6474
atomic_store(&self->rsrvd, false);// release the reservation

src/xx.c

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/// @file xx.c
2+
/// @brief xxHash64 file hashing function implementation.
3+
#include "xx.h"
4+
5+
#include <assert.h>
6+
#include <stdio.h>
7+
8+
#include "xxHash/xxhash.h"
9+
10+
#include "fs.h"
11+
#include "log.h"
12+
13+
/// @brief Computes the xxHash64 hash of the data read from the given file
14+
/// using a buffer for reading. The file is read in chunks until EOF.
15+
/// @param b Buffer for reading file data. Must not be NULL.
16+
/// @param f FILE object to read data from. Must not be NULL. The file should be
17+
/// opened in binary mode and positioned at the start of the data.
18+
/// @return Hash value of the file data, or 0 if an error occurred.
19+
static uint64_t xxhash(uint8_t* b, FILE* f) {
20+
assert(b != NULL);
21+
assert(f != NULL);
22+
XXH3_state_t* state = XXH3_createState();
23+
if (!state) return 0;
24+
if (XXH3_64bits_reset(state)) goto doerr;
25+
size_t n;
26+
while ((n = fread(b, 1, XXBUFSZE, f)) > 0) {
27+
if (XXH3_64bits_update(state, b, n)) goto doerr;
28+
}
29+
uint64_t h = XXH3_64bits_digest(state);
30+
XXH3_freeState(state);
31+
return h;
32+
doerr:
33+
XXH3_freeState(state);
34+
return 0;
35+
}
36+
37+
/// @brief Hashes the contents of the file located at the given filepath.
/// The file is opened in binary mode, handed to xxhash() for chunked reading,
/// and closed again before returning.
/// @param b Buffer for reading file data. Must not be NULL.
/// @param fp The filepath of the file to hash. Must not be NULL.
/// @return The hash value of the file, or 0 if an error occurred (e.g. file
/// could not be opened).
static uint64_t xxhashfp(uint8_t* b, const char* fp) {
  assert(b != NULL);
  assert(fp != NULL);
  uint64_t digest = 0;// reported when the file cannot be opened
  FILE* stream = fopen(fp, "rb");
  if (stream != NULL) {
    digest = xxhash(b, stream);
    fclose(stream);
  }
  return digest;
}
52+
53+
uint64_t xxupdate(struct xxreq_s* req, const char* cause) {
54+
assert(req != NULL);
55+
const struct fsstat_s* pst = req->pst;
56+
if (pst != NULL && pst->lmod == req->cst->lmod && pst->fsze == req->cst->fsze)
57+
return req->pxx;// no change
58+
if (cause != NULL) log_info("hashing `%s` (%s)", req->fp, cause);
59+
return xxhashfp(req->b, req->fp);
60+
}

0 commit comments

Comments
 (0)