Skip to content

Commit e213c5e

Browse files
tsibleyvictorlin
authored andcommitted
Expose Nextclade dataset reference trees under /nextclade/…
The new Nextclade source class and related classes in the Source/Resource/Subresource framework provide access to the "latest" Nextclade dataset reference trees and resolve a myriad of supported aliases. The resource indexer is extended with a new "resource collection" for Nextclade that essentially transforms the existing Nextclade indexes into what our resource indexer expects. This, in turn, fits into existing resourceIndexer/ and src/resourceIndex.js code to provide access to historical versions of Nextclade dataset reference trees. Bumps the resource index version to v9 since changes were made to resourceIndex/. The static-site/app/nextclade/… files were largely copied from static-site/app/staging/… and then modified to refer to the "nextclade" source instead. There is a lot of boilerplate and duplication. But that appears to be the way it's been done for other usages, and I don't have time to make it better so close to the eve of my departure. The biggest differences are in the resources.tsx file. Resolves: <#1156>
1 parent b900c41 commit e213c5e

File tree

19 files changed

+700
-16
lines changed

19 files changed

+700
-16
lines changed

.github/workflows/index-resources.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ jobs:
8585
node resourceIndexer/main.js \
8686
--gzip --output resources.json.gz \
8787
--resourceTypes dataset intermediate \
88-
--collections core staging
88+
--collections core staging nextclade
8989
- name: Upload the new index, overwriting the existing index
9090
if: ${{ startsWith(env.RESOURCE_INDEX, 's3://') }}
9191
run: |

data/manifest_core.json

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -582,12 +582,6 @@
582582
"default": "open"
583583
}
584584
},
585-
"nextclade": {
586-
"dataset": {
587-
"sars-cov-2": "",
588-
"default": "sars-cov-2"
589-
}
590-
},
591585
"nipah": {
592586
"resolution": {
593587
"all": "",

env/production/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,6 @@
110110
"OIDC_GROUPS_CLAIM": "cognito:groups",
111111
"SESSION_COOKIE_DOMAIN": "nextstrain.org",
112112
"GROUPS_DATA_FILE": "groups.json",
113-
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v8.json.gz",
113+
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v9.json.gz",
114114
"PLAUSIBLE_ANALYTICS_DOMAIN": "nextstrain.org"
115115
}

env/testing/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,5 @@
108108
"OIDC_USERNAME_CLAIM": "cognito:username",
109109
"OIDC_GROUPS_CLAIM": "cognito:groups",
110110
"GROUPS_DATA_FILE": "groups.json",
111-
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v8.json.gz"
111+
"RESOURCE_INDEX": "s3://nextstrain-inventories/resources/v9.json.gz"
112112
}

resourceIndexer/main.js

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import { ArgumentParser } from 'argparse';
33
import fs from 'fs';
44
import { coreS3Data, stagingS3Data } from "./coreStagingS3.js";
5+
import { NextcladeData } from "./nextclade.js";
56
import zlib from 'zlib';
67
import { promisify } from 'util';
78
import { ResourceIndexerError } from './errors.js';
@@ -19,8 +20,8 @@ const gzip = promisify(zlib.gzip)
1920
* (sub-)class and resourcePath to parallel the information in the Resource
2021
* (sub-)class.
2122
*
22-
* Currently only sources {core, staging} and resource types {dataset,
23-
* intermediate} are part of the index.
23+
* Currently only sources {core, staging, nextclade} and resource types
24+
* {dataset, intermediate} are part of the index.
2425
*
2526
* As an example, the core WNV/NA (nextstrain.org/WNV/NA) dataset is indexed
2627
* like so:
@@ -34,6 +35,7 @@ const gzip = promisify(zlib.gzip)
3435
const COLLECTIONS = [
3536
coreS3Data,
3637
stagingS3Data,
38+
new NextcladeData(),
3739
];
3840

3941
function parseArgs() {
@@ -47,13 +49,13 @@ function parseArgs() {
4749
`,
4850
});
4951
argparser.addArgument("--local", {action: 'storeTrue',
50-
help: 'Access a local copy of S3 inventories within ./devData/. See docstring of fetchInventoryLocal() for expected filenames.'})
52+
help: 'Access a local copy of S3 inventories ({core,staging}.manifest.json and {core,staging}-*.csv.gz) and Nextclade indexes (nextclade/index.json and nextclade/**/pathogen.json) within ./devData/ instead of downloading them'})
5153
argparser.addArgument("--collections", {metavar: "<name>", type: "string", nargs: '+', choices: COLLECTIONS.map((c) => c.name),
5254
help: "Only fetch data from a subset of collections. Source names are those defined in COLLECTIONS"});
5355
argparser.addArgument("--resourceTypes", {metavar: "<name>", type: "string", nargs: '+', choices: ['dataset', 'intermediate'],
5456
help: "Only index data matching specified resource types"});
5557
argparser.addArgument("--save-inventories", {action: 'storeTrue',
56-
help: "Save the fetched inventories + manifest files to ./devData so that future invocations can use --local"});
58+
help: "Save a local copy of S3 inventories and Nextclade indexes to ./devData/ so that future invocations can use --local"});
5759
argparser.addArgument("--output", {metavar: "<json>", required: true})
5860
argparser.addArgument("--indent", {action: 'storeTrue', help: 'Indent the output JSON'})
5961
argparser.addArgument("--gzip", {action: 'storeTrue', help: 'GZip the output JSON'})
@@ -118,4 +120,4 @@ async function main(args) {
118120
output = await gzip(output)
119121
}
120122
fs.writeFileSync(args.output, output);
121-
}
123+
}

resourceIndexer/nextclade.js

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/**
2+
* Index Nextclade dataset reference trees, including past versions.
3+
*
4+
* Transforms Nextclade's own index for use with our resourceIndexer/… and
5+
* src/resourceIndex.js framework.
6+
*/
7+
import { strict as assert } from "assert";
8+
import { DateTime } from "luxon";
9+
import { readFile, writeFile, mkdir } from "node:fs/promises";
10+
import path from "node:path";
11+
12+
import { fetch } from "../src/fetch.js";
13+
import { NextcladeSource } from "../src/sources/nextclade.js";
14+
import { rootDirFullPath } from "../src/utils/index.js";
15+
16+
17+
const LOCAL_DATA = path.relative(".", path.join(rootDirFullPath, "devData", "nextclade"));
18+
const LOCAL_INDEX = path.join(LOCAL_DATA, "index.json");
19+
20+
21+
/* All class members are part of the "collection" interface expected by
22+
* resourceIndexer/main.js and use its terminology for arguments and return
23+
* values. This interface is kind of a weird fit for things that aren't S3
24+
* inventories, so the chain of methods and way they pass values are a bit
25+
* contrived.
26+
*/
27+
export class NextcladeData {
28+
#source;
29+
30+
name = "nextclade";
31+
32+
async collect({local, save}) {
33+
if (local) {
34+
console.log(`Reading ${LOCAL_INDEX}`);
35+
this.#source = new NextcladeSource(JSON.parse(await readFile(LOCAL_INDEX)));
36+
}
37+
else {
38+
this.#source = new NextcladeSource();
39+
40+
if (save) {
41+
console.log(`Saving ${LOCAL_INDEX}`);
42+
await mkdir(path.dirname(LOCAL_INDEX), {recursive: true});
43+
await writeFile(LOCAL_INDEX, JSON.stringify(await this.#source._index(), null, 2));
44+
}
45+
}
46+
47+
const datasetPaths = await this.#source.availableDatasets();
48+
49+
return (await Promise.all(
50+
datasetPaths.map(async (datasetPath) => {
51+
const dataset = this.#source.dataset(datasetPath.split("/"));
52+
const indexDataset = await dataset._indexDataset();
53+
54+
/* Sort and collapse versions per our documented behaviour:
55+
*
56+
* > All times are UTC. A datestamp refers to datasets uploaded
57+
* > between 00h00 and 23h59 UTC on that day.
58+
*
59+
* > If multiple datasets are uploaded on the same day we take the most
60+
* > recent.
61+
*
62+
* See <https://docs.nextstrain.org/page/guides/snapshots.html#details-for-dataset-maintainers>.
63+
*/
64+
const datesSeen = new Set();
65+
const indexVersions =
66+
indexDataset.versions
67+
.map(v => ({...v, _timestamp: DateTime.fromISO(v.updatedAt, {zone:"UTC"})}))
68+
.toSorted((a, b) => b._timestamp - a._timestamp)
69+
.map(v => ({...v, _date: v._timestamp.toISODate()}))
70+
.filter(v => !datesSeen.has(v._date) && datesSeen.add(v._date))
71+
72+
// Produce one resourceIndexer/main.js "item" per dataset version
73+
return (await Promise.all(
74+
indexVersions.map(async (indexVersion) => {
75+
const versionMetaPath = `${indexDataset.path}/${indexVersion.tag}/pathogen.json`;
76+
77+
const localFile = path.join(LOCAL_DATA, versionMetaPath);
78+
79+
let versionMeta;
80+
81+
if (local) {
82+
console.log(`Reading ${localFile}`);
83+
versionMeta = JSON.parse(await readFile(localFile));
84+
}
85+
else {
86+
const remoteUrl = await this.#source.urlFor(versionMetaPath);
87+
88+
console.log(`Fetching ${remoteUrl}`);
89+
const response = await fetch(remoteUrl, {cache: "no-cache"});
90+
assert(response.status === 200);
91+
92+
versionMeta = await response.json();
93+
94+
if (save) {
95+
console.log(`Saving ${localFile}`);
96+
await mkdir(path.dirname(localFile), {recursive: true});
97+
await writeFile(localFile, JSON.stringify(versionMeta, null, 2));
98+
}
99+
}
100+
101+
/* This filter must be *after* we fetch the version's own
102+
* pathogen.json. Because versions are filtered to one-per-day
103+
* *before* we fetch, it's possible there's an older version from
104+
* the same day that *does* include a treeJson, and we'd miss it.
105+
* The fix would be fetching *all* versions and only then filtering
106+
* to one-per-day (i.e. in createResource() below).
107+
*
108+
* Doing so, however, seems unnecessary. The scenario seems
109+
* unlikely and it's not entirely clear how we'd want to interpret
110+
* such a dataset update anyway (e.g. was the earlier version on
111+
* the same day in error?).
112+
*
113+
* Also note that this filters out some datasets entirely: those
114+
* that don't have a reference tree at all.
115+
* -trs, 27 Oct 2025
116+
*/
117+
if (!versionMeta.files.treeJson)
118+
return;
119+
120+
// One "item" produced by collect()
121+
return {
122+
// Used by resourceIndexer/main.js
123+
source: this.#source.name,
124+
resourceType: "dataset",
125+
resourcePath: datasetPath,
126+
127+
// Used in createResource() below
128+
version: {
129+
date: indexVersion._date,
130+
fileUrls: {
131+
main: await this.#source.urlFor(`${indexDataset.path}/${indexVersion.tag}/${versionMeta.files.treeJson}`)
132+
}
133+
},
134+
};
135+
})
136+
)).flat();
137+
})
138+
)).flat();
139+
}
140+
141+
categorise(item) {
142+
return item;
143+
}
144+
145+
createResource(resourceType, resourcePath, items) {
146+
return {
147+
versions: items.map(i => i.version),
148+
};
149+
}
150+
}

src/app.js

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ const {
1414
errors,
1515
fetch,
1616
groups,
17+
nextclade,
1718
openid,
1819
pathogenRepos,
1920
schemas,
@@ -68,7 +69,6 @@ charon.setup(app);
6869
* /monkeypox
6970
* /mpox
7071
* /ncov
71-
* /nextclade
7272
* /rsv
7373
* /rubella
7474
* /seasonal-flu
@@ -92,6 +92,15 @@ core.setup(app);
9292
staging.setup(app);
9393

9494

95+
/* Nextclade reference datasets
96+
*
97+
* Routes:
98+
* /nextclade
99+
* /nextclade/*
100+
*/
101+
nextclade.setup(app);
102+
103+
95104
/* Community on GitHub
96105
*
97106
* Routes:

src/resourceIndex.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,8 @@ class ListResources {
191191
return ""
192192
case "staging":
193193
return "staging/"
194+
case "nextclade":
195+
return "nextclade/"
194196
default:
195197
throw new InternalServerError(`Source "${name}" does not have a corresponding prefix`)
196198
}

src/routing/core.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ const coreBuildPaths = [
4040
"/monkeypox", // Not actively updated, but YYYY-MM-DD URLs remain & don't redirect
4141
"/mpox", // monkeypox URLs will redirect to /mpox (except for datestamped URLs)
4242
"/ncov",
43-
"/nextclade",
4443
"/nipah",
4544
"/norovirus",
4645
"/oropouche",

src/routing/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export * as errors from "./errors.js";
88
export * as fetch from './fetch.js';
99
export * as groups from './groups.js';
1010
export * as listResources from './listResources.js';
11+
export * as nextclade from './nextclade.js';
1112
export * as openid from './openid.js';
1213
export * as pathogenRepos from './pathogenRepos.js';
1314
export * as schemas from './schemas.js';

0 commit comments

Comments
 (0)