From b7fc3b6928817c58bf41d94fef29e64c4e70f3c0 Mon Sep 17 00:00:00 2001 From: Mikhail Koviazin Date: Fri, 26 Dec 2025 16:23:49 +0100 Subject: [PATCH 1/2] DataLake: add support for gs:// links in Iceberg metadata This PR adds support for the `gs` storage type when GCS is accessed through its S3-compatible XML API. Additionally, `NoSignCredentials` was added for the case where the GCS bucket is public and NOSIGN should be added to the request. --- src/Databases/DataLake/ICatalog.cpp | 3 ++ src/Databases/DataLake/RestCatalog.cpp | 56 +++++++++++++++------ src/Databases/DataLake/StorageCredentials.h | 13 +++++ src/Storages/ObjectStorage/Utils.cpp | 2 +- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/Databases/DataLake/ICatalog.cpp b/src/Databases/DataLake/ICatalog.cpp index c7ab78030411..f4f6a715c665 100644 --- a/src/Databases/DataLake/ICatalog.cpp +++ b/src/Databases/DataLake/ICatalog.cpp @@ -53,6 +53,9 @@ StorageType parseStorageTypeFromString(const std::string & type) if (capitalize_first_letter(storage_type_str) == "S3a") storage_type_str = "S3"; + if (capitalize_first_letter(storage_type_str) == "Gs") + storage_type_str = "S3"; + auto storage_type = magic_enum::enum_cast(capitalize_first_letter(storage_type_str)); if (!storage_type) diff --git a/src/Databases/DataLake/RestCatalog.cpp b/src/Databases/DataLake/RestCatalog.cpp index d267e22cd0ee..54e5867c425d 100644 --- a/src/Databases/DataLake/RestCatalog.cpp +++ b/src/Databases/DataLake/RestCatalog.cpp @@ -660,26 +660,52 @@ bool RestCatalog::getTableMetadataImpl( { case StorageType::S3: { - static constexpr auto access_key_id_str = "s3.access-key-id"; - static constexpr auto secret_access_key_str = "s3.secret-access-key"; - static constexpr auto session_token_str = "s3.session-token"; - static constexpr auto storage_endpoint_str = "s3.endpoint"; + /// S3 config keys + static constexpr auto s3_access_key_id_str = "s3.access-key-id"; + static constexpr auto s3_secret_access_key_str = "s3.secret-access-key"; + static constexpr 
auto s3_session_token_str = "s3.session-token"; + static constexpr auto s3_endpoint_str = "s3.endpoint"; + + /// GCS config keys (for gs:// URLs accessed via S3-compatible API) + static constexpr auto gcs_no_auth_str = "gcs.no-auth"; + static constexpr auto gcs_access_key_id_str = "gcs.access-key-id"; + static constexpr auto gcs_secret_access_key_str = "gcs.secret-access-key"; + static constexpr auto gcs_endpoint_str = "gcs.endpoint"; std::string access_key_id; std::string secret_access_key; std::string session_token; std::string storage_endpoint; - if (config_object->has(access_key_id_str)) - access_key_id = config_object->get(access_key_id_str).extract(); - if (config_object->has(secret_access_key_str)) - secret_access_key = config_object->get(secret_access_key_str).extract(); - if (config_object->has(session_token_str)) - session_token = config_object->get(session_token_str).extract(); - if (config_object->has(storage_endpoint_str)) - storage_endpoint = config_object->get(storage_endpoint_str).extract(); - - result.setStorageCredentials( - std::make_shared(access_key_id, secret_access_key, session_token)); + bool no_auth = false; + + /// Check GCS config first (for gs:// URLs) + if (config_object->has(gcs_no_auth_str)) + { + auto no_auth_value = config_object->get(gcs_no_auth_str).toString(); + no_auth = (no_auth_value == "true" || no_auth_value == "1"); + } + if (config_object->has(gcs_access_key_id_str)) + access_key_id = config_object->get(gcs_access_key_id_str).extract(); + if (config_object->has(gcs_secret_access_key_str)) + secret_access_key = config_object->get(gcs_secret_access_key_str).extract(); + if (config_object->has(gcs_endpoint_str)) + storage_endpoint = config_object->get(gcs_endpoint_str).extract(); + + /// Fall back to S3 config keys + if (config_object->has(s3_access_key_id_str)) + access_key_id = config_object->get(s3_access_key_id_str).extract(); + if (config_object->has(s3_secret_access_key_str)) + secret_access_key = 
config_object->get(s3_secret_access_key_str).extract(); + if (config_object->has(s3_session_token_str)) + session_token = config_object->get(s3_session_token_str).extract(); + if (config_object->has(s3_endpoint_str)) + storage_endpoint = config_object->get(s3_endpoint_str).extract(); + + if (no_auth) + result.setStorageCredentials(std::make_shared()); + else + result.setStorageCredentials( + std::make_shared(access_key_id, secret_access_key, session_token)); result.setEndpoint(storage_endpoint); break; diff --git a/src/Databases/DataLake/StorageCredentials.h b/src/Databases/DataLake/StorageCredentials.h index 80200c835cea..ca5714df8194 100644 --- a/src/Databases/DataLake/StorageCredentials.h +++ b/src/Databases/DataLake/StorageCredentials.h @@ -48,4 +48,17 @@ class S3Credentials final : public IStorageCredentials std::string session_token; }; +/// Credentials for public buckets that don't require signing (e.g., GCS with no-auth) +class NoSignCredentials final : public IStorageCredentials +{ +public: + void addCredentialsToEngineArgs(DB::ASTs & engine_args) const override + { + if (engine_args.size() != 1) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Storage credentials specified in AST already"); + + engine_args.push_back(std::make_shared("NOSIGN")); + } +}; + } diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index c4629582ee28..fbcd277535d5 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -38,7 +38,7 @@ std::string normalizeScheme(const std::string & scheme) { auto scheme_lowercase = Poco::toLower(scheme); - if (scheme_lowercase == "s3a" || scheme_lowercase == "s3n") + if (scheme_lowercase == "s3a" || scheme_lowercase == "s3n" || scheme_lowercase == "gs") scheme_lowercase = "s3"; else if (scheme_lowercase == "wasb" || scheme_lowercase == "wasbs" || scheme_lowercase == "abfss") scheme_lowercase = "abfs"; From 4baf17c1964889a5e5e3f35bb22a6b0e78465656 Mon Sep 17 00:00:00 
2001 From: Mikhail Koviazin Date: Wed, 14 Jan 2026 16:15:38 +0100 Subject: [PATCH 2/2] DataLakeCatalog: fix `data_location` if `storage_endpoint` was set If `storage_endpoint` was set during `DataLakeCatalog` creation (e.g. when using GCS in S3-compatible mode), `data_location` is populated with an `https://` link. When `endpoint` is stripped from it, a leading slash remains in `data_location`, which makes the condition of the next `if` statement false and causes the query to fail. This commit fixes that by removing the leading slash if `data_location` still starts with one after `endpoint` is stripped. --- src/Databases/DataLake/ICatalog.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Databases/DataLake/ICatalog.cpp b/src/Databases/DataLake/ICatalog.cpp index f4f6a715c665..d0b9edafe08b 100644 --- a/src/Databases/DataLake/ICatalog.cpp +++ b/src/Databases/DataLake/ICatalog.cpp @@ -221,7 +221,11 @@ std::string TableMetadata::getMetadataLocation(const std::string & iceberg_metad if (data_location.starts_with(storage_type_str)) data_location = data_location.substr(storage_type_str.size()); else if (!endpoint.empty() && data_location.starts_with(endpoint)) + { data_location = data_location.substr(endpoint.size()); + if (!data_location.empty() && data_location.front() == '/') + data_location = data_location.substr(1); + } if (metadata_location.starts_with(data_location)) {