Commit a1c29d5

feat!(backend): refactor multi-segment submission (2/n) (#5398)
Resolves #4708, #4734; partially resolves #5392, #5185 (comment). Builds on #5382.

When users submit to multi-segmented organisms and want to group multiple segments under one metadata entry, they are now required to add an additional `fastaId` column containing a space- (or comma-) separated list of the `fastaId`s (FASTA header IDs) of the respective sequences; see the example metadata and FASTA snippet below. If no `fastaId` column is supplied, the `submissionId` is used instead and the backend assumes (as in the single-segmented case) a one-to-one mapping of metadata `submissionId` to `fastaId`.

This new submission structure was voted for in microbioinfo (https://microbial-bioinfo.slack.com/archives/CB0HYT53M/p1760961465729399) and discussed in https://app.nuclino.com/Loculus/Development/2025-10-20-Weekly-6d5fe89f-8ded-4286-b892-d215e0a498f6 (and in other meetings).

Nextclade sort will be used to assign segments/subtypes for all aligned sequences:

```
minimizer_index: <url_to_minimizer_index_used_by_nextclade_sort>
```

For organisms without a nextclade dataset we still allow the FASTA headers to be used to determine the segment/subtype; entries must have the format `<submissionId>_<segmentName>` (as in the current setup).

As preprocessing now assigns segments, it returns a map from the segment (or subtype) to the fastaHeader in the processedData: `sequenceNameToFastaHeaderMap`. This allows us to surface this assignment on the edit page.

You can use pathoplexus/dev_example_data#2 for testing.

Instead of having a dictionary for the nextclade datasets and servers, we make `nucleotideSequences` a list of sequences.

Before:

```
nextclade_dataset_name:
  L: nextstrain/cchfv/linked/L
  M: nextstrain/cchfv/linked/M
  S: nextstrain/cchfv/linked/S
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
genes: [RdRp, GPC, NP]
```

After:

```
nucleotideSequences:
  - name: L
    nextclade_dataset_name: nextstrain/cchfv/linked/L
    nextclade_dataset_tag: <optional - was previously incorrectly placed on an organism level>
    nextclade_dataset_server: <optional, overwrites nextclade_dataset_server for this sequence>
    accepted_sort_matches: <optional, used for classify_with_nextclade_sort and require_nextclade_sort_match; if not given, nextclade_dataset_name is used>
    gene_prefix: <optional, prefix to add to genes produced by the nextclade run, e.g. nextclade labels genes as `AV1` but we expect `EV1_AV1`, here `EV1` would be the prefix>
  - name: M
    nextclade_dataset_name: nextstrain/cchfv/linked/M
  - name: S
    nextclade_dataset_name: nextstrain/cchfv/linked/S
    nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
```

Note that the templates now also generate the `genes` list from the merged config.
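As a concrete illustration of the `fastaId` column described above (a hedged sketch: the sample IDs, the `geoLocCountry` column, and the sequence values are invented; only `id`/`submissionId` and `fastaId` are the columns this change actually requires), a single metadata entry grouping three segments could look like this (metadata columns tab-separated):

```
# metadata.tsv
id       fastaId                          geoLocCountry
sample1  cchf_L_01 cchf_M_01 cchf_S_01    Nigeria

# sequences.fasta
>cchf_L_01
ACTG...
>cchf_M_01
GTCA...
>cchf_S_01
TTAG...
```

Each FASTA header appears in the `fastaId` list of exactly one metadata row; preprocessing (via Nextclade sort, or via `<submissionId>_<segmentName>` headers where no Nextclade dataset exists) then decides which header corresponds to which segment.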
- [ ] Update values.schema.json
- [x] Keep tests for the alignment NONE case
- [x] Create a minimizer for tests using https://github.com/loculus-project/nextclade-sort-minimizer-creator
- [x] Document the manual testing that has been done: EVs from the test folder were submitted with the same fastaHeader as the submissionId, which succeeded; submission of CCHF with a `fastaId` column in the metadata was tested (also in the folder above); revision of a segment was tested
- [x] Have preprocessing send back a segment-to-fastaHeader mapping
- [ ] Add integration testing for the full EV submission user journey
- [ ] Improve the CCHF minimizer (some segments are again not assigned)
- [ ] Discuss whether the originalData dictionary should be migrated (the persistent DB has segmentName as key, now we have fastaHeader as key)
- [ ] Update the PPX docs with the new multi-segment submission format

🚀 Preview: https://multi-segment-submission.loculus.org

---------

Co-authored-by: Cornelius Roemer <[email protected]>
1 parent 123b94f commit a1c29d5

67 files changed (+1361, −649 lines)


backend/docs/db/schema.sql (12 additions, 5 deletions)

```diff
@@ -378,7 +378,8 @@ CREATE TABLE public.metadata_upload_aux_table (
     group_id integer,
     uploaded_at timestamp without time zone NOT NULL,
     metadata jsonb NOT NULL,
-    files jsonb
+    files jsonb,
+    fasta_ids jsonb DEFAULT '[]'::jsonb
 );


@@ -538,9 +539,8 @@ ALTER VIEW public.sequence_entries_view OWNER TO postgres;

 CREATE TABLE public.sequence_upload_aux_table (
     upload_id text NOT NULL,
-    submission_id text NOT NULL,
-    segment_name text NOT NULL,
-    compressed_sequence_data text NOT NULL
+    compressed_sequence_data text NOT NULL,
+    fasta_id text NOT NULL
 );


@@ -753,7 +753,7 @@ ALTER TABLE ONLY public.sequence_entries_preprocessed_data
 --

 ALTER TABLE ONLY public.sequence_upload_aux_table
-    ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, submission_id, segment_name);
+    ADD CONSTRAINT sequence_upload_aux_table_pkey PRIMARY KEY (upload_id, fasta_id);


 --
@@ -794,6 +794,13 @@ CREATE INDEX data_use_terms_table_accession_idx ON public.data_use_terms_table U
 CREATE INDEX flyway_schema_history_s_idx ON public.flyway_schema_history USING btree (success);


+--
+-- Name: metadata_upload_aux_table_fasta_ids_idx; Type: INDEX; Schema: public; Owner: postgres
+--
+
+CREATE INDEX metadata_upload_aux_table_fasta_ids_idx ON public.metadata_upload_aux_table USING gin (fasta_ids jsonb_path_ops);
+
+
 --
 -- Name: sequence_entries_organism_idx; Type: INDEX; Schema: public; Owner: postgres
 --
```

backend/src/main/kotlin/org/loculus/backend/api/SubmissionTypes.kt (8 additions, 2 deletions)

```diff
@@ -8,6 +8,7 @@ import com.fasterxml.jackson.databind.JsonDeserializer
 import com.fasterxml.jackson.databind.JsonNode
 import com.fasterxml.jackson.databind.annotation.JsonDeserialize
 import io.swagger.v3.oas.annotations.media.Schema
+import org.loculus.backend.model.FastaId
 import org.loculus.backend.model.SubmissionId
 import org.loculus.backend.service.files.FileId
 import org.loculus.backend.utils.Accession
@@ -166,6 +167,11 @@ data class ProcessedData<SequenceType>(
         description = "The key is the gene name, the value is a list of amino acid insertions",
     )
     val aminoAcidInsertions: Map<GeneName, List<Insertion>>,
+    @Schema(
+        example = """{"segment1": "fastaHeader1", "segment2": "fastaHeader2"}""",
+        description = "The key is the segment name, the value is the fastaHeader of the original Data",
+    )
+    val sequenceNameToFastaHeaderMap: Map<SegmentName, String> = emptyMap(),
     @Schema(
         example = """{"raw_reads": [{"fileId": "s0m3-uUiDd", "name": "data.fastaq"}], "sequencing_logs": []}""",
         description = "The key is the file category name, the value is a list of files, with ID and name.",
@@ -300,9 +306,9 @@ data class OriginalDataInternal<SequenceType, FilesType>(
     val metadata: Map<String, String>,
     @Schema(
         example = "{\"segment1\": \"ACTG\", \"segment2\": \"GTCA\"}",
-        description = "The key is the segment name, the value is the nucleotide sequence",
+        description = "The key is the fastaID, the value is the nucleotide sequence",
     )
-    val unalignedNucleotideSequences: Map<SegmentName, SequenceType?>,
+    val unalignedNucleotideSequences: Map<FastaId, SequenceType?>,
     @Schema(
         example = """{"raw_reads": [{"fileId": "f1le-uuId-asdf", "name": "myfile.fastaq"]}""",
         description = "A map from file categories, to lists of files. The files can also have URLs.",
```

backend/src/main/kotlin/org/loculus/backend/controller/SubmissionControllerDescriptions.kt (9 additions, 7 deletions)

```diff
@@ -1,13 +1,13 @@
 package org.loculus.backend.controller

-import org.loculus.backend.model.HEADER_TO_CONNECT_METADATA_AND_SEQUENCES
+import org.loculus.backend.model.METADATA_ID_HEADER

 const val SUBMIT_RESPONSE_DESCRIPTION = """
 Returns a list of accession, version and submissionId of the submitted sequence entries.
-The submissionId is the (locally unique) '$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES' provided by the submitter in the metadata file.
+The submissionId is the (locally unique) '$METADATA_ID_HEADER' provided by the submitter in the metadata file.
 The version will be 1 for every sequence.
 The accession is the (globally unique) id that the system assigned to the sequence entry.
-You can use this response to associate the user provided $HEADER_TO_CONNECT_METADATA_AND_SEQUENCES with the system assigned accession.
+You can use this response to associate the user provided $METADATA_ID_HEADER with the system assigned accession.
 """

 const val SUBMIT_ERROR_RESPONSE = """
@@ -18,16 +18,18 @@ const val METADATA_FILE_DESCRIPTION = """
 A TSV (tab separated values) file containing the metadata of the submitted sequence entries.
 The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
 It must contain the column names.
-The field '$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES' is required and must be unique within the provided dataset.
+The field '$METADATA_ID_HEADER' is required and must be unique within the provided dataset.
 It is used to associate metadata to the sequences in the sequences fasta file.
 """
+
+// TODO: update description
 const val SEQUENCE_FILE_DESCRIPTION = """
 A fasta file containing the unaligned nucleotide sequences of the submitted sequences.
 The file may be compressed with zstd, xz, zip, gzip, lzma, bzip2 (with common extensions).
 If the underlying organism has a single segment,
-the headers of the fasta file must match the '$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES' field in the metadata file.
+the headers of the fasta file must match the '$METADATA_ID_HEADER' field in the metadata file.
 If the underlying organism has multiple segments,
-the headers of the fasta file must be of the form '>[$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES]_[segmentName]'.
+the headers of the fasta file must be of the form '>[$METADATA_ID_HEADER]_[segmentName]'.
 """

 const val FILE_MAPPING_DESCRIPTION = """
@@ -114,7 +116,7 @@ The version will increase by one in respect to the original accession version.

 const val REVISED_METADATA_FILE_DESCRIPTION = """
 A TSV (tab separated values) file containing the metadata of the revised data.
-The first row must contain the column names. The column '$HEADER_TO_CONNECT_METADATA_AND_SEQUENCES' is required and must be unique within the
+The first row must contain the column names. The column '$METADATA_ID_HEADER' is required and must be unique within the
 provided dataset. It is used to associate metadata to the sequences in the sequences fasta file.
 Additionally, the column 'accession' is required and must match the accession of the original sequence entry.
 """
```

backend/src/main/kotlin/org/loculus/backend/model/SubmitModel.kt (39 additions, 29 deletions)

```diff
@@ -31,13 +31,15 @@ import java.io.BufferedInputStream
 import java.io.File
 import java.io.InputStream

-const val HEADER_TO_CONNECT_METADATA_AND_SEQUENCES = "id"
-const val HEADER_TO_CONNECT_METADATA_AND_SEQUENCES_ALTERNATE_FOR_BACKCOMPAT = "submissionId"
+const val METADATA_ID_HEADER = "id"
+const val METADATA_ID_HEADER_ALTERNATE_FOR_BACKCOMPAT = "submissionId"
+const val FASTA_ID_HEADER = "fastaId"

 const val ACCESSION_HEADER = "accession"
 private val log = KotlinLogging.logger { }

 typealias SubmissionId = String
+typealias FastaId = String
 typealias SegmentName = String

 const val UNIQUE_CONSTRAINT_VIOLATION_SQL_STATE = "23505"
@@ -126,8 +128,13 @@ class SubmitModel(
         val metadataSubmissionIds = uploadDatabaseService.getMetadataUploadSubmissionIds(uploadId).toSet()
         if (requiresConsensusSequenceFile(submissionParams.organism)) {
             log.debug { "Validating submission with uploadId $uploadId" }
-            val sequenceSubmissionIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet()
-            validateSubmissionIdSetsForConsensusSequences(metadataSubmissionIds, sequenceSubmissionIds)
+            val metadataFastaIds = uploadDatabaseService.getFastaIdsForMetadata(uploadId).flatten()
+            val metadataFastaIdsSet = metadataFastaIds.toSet()
+            if (metadataFastaIdsSet.size < metadataFastaIds.size) {
+                throw UnprocessableEntityException("Metadata file contains duplicate fastaIds.")
+            }
+            val sequenceFastaIds = uploadDatabaseService.getSequenceUploadSubmissionIds(uploadId).toSet()
+            validateSubmissionIdSetsForConsensusSequences(metadataFastaIdsSet, sequenceFastaIds)
         }

         if (submissionParams is SubmissionParams.RevisionSubmissionParams) {
@@ -167,38 +174,39 @@ class SubmitModel(
             metadataFileTypes,
             metadataTempFileToDelete,
         )
+        val addFastaId = requiresConsensusSequenceFile(submissionParams.organism)
         try {
-            uploadMetadata(uploadId, submissionParams, metadataStream, batchSize)
+            uploadMetadata(uploadId, submissionParams, metadataStream, batchSize, addFastaId = addFastaId)
         } finally {
             metadataTempFileToDelete.delete()
         }

         val sequenceFile = submissionParams.sequenceFile
         if (sequenceFile == null) {
-            if (requiresConsensusSequenceFile(submissionParams.organism)) {
+            if (addFastaId) {
                 throw BadRequestException(
                     "Submissions for organism ${submissionParams.organism.name} require a sequence file.",
                 )
             }
-        } else {
-            if (!requiresConsensusSequenceFile(submissionParams.organism)) {
-                throw BadRequestException(
-                    "Sequence uploads are not allowed for organism ${submissionParams.organism.name}.",
-                )
-            }
+            return
+        }
+        if (!addFastaId) {
+            throw BadRequestException(
+                "Sequence uploads are not allowed for organism ${submissionParams.organism.name}.",
+            )
+        }

-            val sequenceTempFileToDelete = MaybeFile()
-            try {
-                val sequenceStream = getStreamFromFile(
-                    sequenceFile,
-                    uploadId,
-                    sequenceFileTypes,
-                    sequenceTempFileToDelete,
-                )
-                uploadSequences(uploadId, sequenceStream, batchSize, submissionParams.organism)
-            } finally {
-                sequenceTempFileToDelete.delete()
-            }
+        val sequenceTempFileToDelete = MaybeFile()
+        try {
+            val sequenceStream = getStreamFromFile(
+                sequenceFile,
+                uploadId,
+                sequenceFileTypes,
+                sequenceTempFileToDelete,
+            )
+            uploadSequences(uploadId, sequenceStream, batchSize, submissionParams.organism)
+        } finally {
+            sequenceTempFileToDelete.delete()
         }
     }

@@ -244,6 +252,7 @@ class SubmitModel(
         submissionParams: SubmissionParams,
         metadataStream: InputStream,
         batchSize: Int,
+        addFastaId: Boolean,
     ) {
         log.debug {
             "intermediate storing uploaded metadata of type ${submissionParams.uploadType.name} " +
@@ -253,7 +262,7 @@ class SubmitModel(
         try {
             when (submissionParams) {
                 is SubmissionParams.OriginalSubmissionParams -> {
-                    metadataEntryStreamAsSequence(metadataStream)
+                    metadataEntryStreamAsSequence(metadataStream, addFastaId)
                         .chunked(batchSize)
                         .forEach { batch ->
                             uploadDatabaseService.batchInsertMetadataInAuxTable(
@@ -269,7 +278,7 @@ class SubmitModel(
                 }

                 is SubmissionParams.RevisionSubmissionParams -> {
-                    revisionEntryStreamAsSequence(metadataStream)
+                    revisionEntryStreamAsSequence(metadataStream, addFastaId)
                         .chunked(batchSize)
                         .forEach { batch ->
                             uploadDatabaseService.batchInsertRevisedMetadataInAuxTable(
@@ -344,14 +353,15 @@ class SubmitModel(

         if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) {
             val metadataNotPresentErrorText = if (metadataKeysNotInSequences.isNotEmpty()) {
-                "Metadata file contains ${metadataKeysNotInSequences.size} ids that are not present " +
+                "Metadata file contains ${metadataKeysNotInSequences.size} FASTA ids that are not present " +
                     "in the sequence file: " + metadataKeysNotInSequences.toList().joinToString(limit = 10) + "; "
             } else {
                 ""
             }
             val sequenceNotPresentErrorText = if (sequenceKeysNotInMetadata.isNotEmpty()) {
-                "Sequence file contains ${sequenceKeysNotInMetadata.size} ids that are not present " +
-                    "in the metadata file: " + sequenceKeysNotInMetadata.toList().joinToString(limit = 10)
+                "Sequence file contains ${sequenceKeysNotInMetadata.size} FASTA ids that are not present " +
+                    "in the metadata file: " +
+                    sequenceKeysNotInMetadata.toList().joinToString(limit = 10)
             } else {
                 ""
             }
```
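For readability, here is a condensed, self-contained Kotlin sketch of the fastaId consistency checks introduced above (the standalone function and the simplified exception class are assumptions for this sketch; in the PR the duplicate check lives in `SubmitModel` and the set comparison in `validateSubmissionIdSetsForConsensusSequences`):

```kotlin
// Sketch of the two checks: metadata fastaIds must be unique, and the metadata and
// sequence files must reference exactly the same set of fastaIds.

class UnprocessableEntityException(message: String) : RuntimeException(message)

fun validateFastaIds(metadataFastaIds: List<String>, sequenceFastaIds: Set<String>) {
    val metadataFastaIdsSet = metadataFastaIds.toSet()
    if (metadataFastaIdsSet.size < metadataFastaIds.size) {
        throw UnprocessableEntityException("Metadata file contains duplicate fastaIds.")
    }

    val metadataKeysNotInSequences = metadataFastaIdsSet - sequenceFastaIds
    val sequenceKeysNotInMetadata = sequenceFastaIds - metadataFastaIdsSet
    if (metadataKeysNotInSequences.isNotEmpty() || sequenceKeysNotInMetadata.isNotEmpty()) {
        throw UnprocessableEntityException(
            "Metadata file contains ${metadataKeysNotInSequences.size} FASTA ids that are not present " +
                "in the sequence file: ${metadataKeysNotInSequences.toList().joinToString(limit = 10)}; " +
                "Sequence file contains ${sequenceKeysNotInMetadata.size} FASTA ids that are not present " +
                "in the metadata file: ${sequenceKeysNotInMetadata.toList().joinToString(limit = 10)}",
        )
    }
}
```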

backend/src/main/kotlin/org/loculus/backend/service/submission/CompressionService.kt (2 additions, 0 deletions)

```diff
@@ -102,6 +102,7 @@ class CompressionService(private val compressionDictService: CompressionDictServ
             }
         },
         processedData.aminoAcidInsertions,
+        processedData.sequenceNameToFastaHeaderMap,
         processedData.files,
     )

@@ -128,6 +129,7 @@
             }
         },
         processedData.aminoAcidInsertions,
+        processedData.sequenceNameToFastaHeaderMap,
         processedData.files,
     )

```

backend/src/main/kotlin/org/loculus/backend/service/submission/EmptyProcessedDataProvider.kt (1 addition, 0 deletions)

```diff
@@ -20,6 +20,7 @@ class EmptyProcessedDataProvider(private val backendConfig: BackendConfig) {
         alignedAminoAcidSequences = referenceGenome.genes.map { it.name }.associateWith { null },
         nucleotideInsertions = referenceGenome.nucleotideSequences.map { it.name }.associateWith { emptyList() },
         aminoAcidInsertions = referenceGenome.genes.map { it.name }.associateWith { emptyList() },
+        sequenceNameToFastaHeaderMap = referenceGenome.nucleotideSequences.map { it.name }.associateWith { "" },
         files = null,
     )
 }
```

backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt (5 additions, 0 deletions)

```diff
@@ -232,6 +232,11 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re
             "alignedNucleotideSequences",
         )

+        validateNoUnknownSegment(
+            processedData.sequenceNameToFastaHeaderMap,
+            "sequenceNameToFastaHeaderMap",
+        )
+
         validateNoUnknownSegment(
             processedData.unalignedNucleotideSequences,
             "unalignedNucleotideSequences",
```

backend/src/main/kotlin/org/loculus/backend/service/submission/SubmissionDatabaseService.kt (2 additions, 1 deletion)

```diff
@@ -457,6 +457,7 @@ class SubmissionDatabaseService(
         aminoAcidInsertions = processedData.aminoAcidInsertions.mapValues { (_, it) ->
             it.map { insertion -> insertion.copy(sequence = insertion.sequence.uppercase(Locale.US)) }
         },
+        sequenceNameToFastaHeaderMap = processedData.sequenceNameToFastaHeaderMap,
     )

     private fun validateExternalMetadata(
@@ -1224,7 +1225,7 @@ class SubmissionDatabaseService(
             .fetchSize(streamBatchSize)
             .asSequence()
             .map {
-                // Revoked sequences have no original metdadata, hence null can happen
+                // Revoked sequences have no original metadata, hence null can happen
                 @Suppress("USELESS_ELVIS")
                 val metadata = it[originalMetadata] ?: null
                 val selectedMetadata = fields?.associateWith { field -> metadata?.get(field) }
```
