diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh new file mode 100644 index 0000000000..1aa42cb6c8 --- /dev/null +++ b/http/src/main/resources/init-resources/galaxy-user-data.sh @@ -0,0 +1,150 @@ +#cloud-config +# Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh +# When updating this file, sync it manually from that repository and verify the changes. +write_files: + - path: /usr/local/bin/galaxy_bootstrap.sh + permissions: '0755' + owner: root:root + content: | + #!/bin/bash + + echo "[$(date)] - Starting galaxy_bootstrap script..." + + # 1. Setup persistent disk if available + DISK_DEVICE="/dev/disk/by-id/google-galaxy-data" + if [ -b "$DISK_DEVICE" ]; then + echo "[$(date)] - Found persistent disk at $DISK_DEVICE" + + # Check if disk is already formatted + if ! blkid "$DISK_DEVICE" > /dev/null 2>&1; then + echo "[$(date)] - Formatting disk $DISK_DEVICE with ext4" + mkfs -t ext4 "$DISK_DEVICE" + else + echo "[$(date)] - Disk $DISK_DEVICE is already formatted" + fi + + # Create mount point and mount + mkdir -p /mnt/block_storage + mount "$DISK_DEVICE" /mnt/block_storage + + # Add to fstab for persistent mounting across reboots + DISK_UUID=$(blkid -s UUID -o value "$DISK_DEVICE") + if [ -n "$DISK_UUID" ] && ! grep -q "$DISK_UUID" /etc/fstab; then + echo "UUID=$DISK_UUID /mnt/block_storage ext4 defaults 0 2" >> /etc/fstab + fi + + # Set proper ownership + chown debian:debian /mnt/block_storage + echo "[$(date)] - Persistent disk mounted at /mnt/block_storage" + else + echo "[$(date)] - No persistent disk found at $DISK_DEVICE. Galaxy will use ephemeral storage." + fi + + # 2. Setup PostgreSQL disk if available + POSTGRES_DISK_DEVICE="/dev/disk/by-id/google-galaxy-postgres-data" + if [ -b "$POSTGRES_DISK_DEVICE" ]; then + echo "[$(date)] - Found PostgreSQL disk at $POSTGRES_DISK_DEVICE" + + # Check if disk is already formatted + if ! blkid "$POSTGRES_DISK_DEVICE" > /dev/null 2>&1; then + echo "[$(date)] - Formatting PostgreSQL disk $POSTGRES_DISK_DEVICE with ext4" + mkfs -t ext4 "$POSTGRES_DISK_DEVICE" + else + echo "[$(date)] - PostgreSQL disk $POSTGRES_DISK_DEVICE is already formatted" + fi + + # Create mount point and mount + mkdir -p /mnt/postgres_storage + mount "$POSTGRES_DISK_DEVICE" /mnt/postgres_storage + + # Add to fstab for persistent mounting across reboots + POSTGRES_DISK_UUID=$(blkid -s UUID -o value "$POSTGRES_DISK_DEVICE") + if [ -n "$POSTGRES_DISK_UUID" ] && ! grep -q "$POSTGRES_DISK_UUID" /etc/fstab; then + echo "UUID=$POSTGRES_DISK_UUID /mnt/postgres_storage ext4 defaults 0 2" >> /etc/fstab + fi + + # Set proper ownership + chown debian:debian /mnt/postgres_storage + echo "[$(date)] - PostgreSQL disk mounted at /mnt/postgres_storage" + else + echo "[$(date)] - No PostgreSQL disk found at $POSTGRES_DISK_DEVICE. PostgreSQL will use ephemeral storage." + fi + + # 3. Run ansible-pull + sudo -u debian bash -c ' + export HOME=/home/debian + HOST_IP=$(curl -s ifconfig.me) + + PV_SIZE=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/persistent-volume-size" -H "Metadata-Flavor: Google" 2>/dev/null) + if [ -z "$PV_SIZE" ]; then + echo "[$(date)] - persistent-volume-size metadata not found or empty, using default." 
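+          # Fallback used when the instance was launched without the persistent-volume-size
+          # metadata key; Leonardo normally sets it (see installGalaxyVm in GKEInterpreter).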
+ PV_SIZE="139Gi" + fi + echo "[$(date)] - NFS storage size for Galaxy: ${PV_SIZE}" + + # Add restore_galaxy if enabled + RESTORE_GALAXY=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/restore_galaxy" -H "Metadata-Flavor: Google" 2>/dev/null || echo "false") + + GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com") + echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}" + + # Leo proxy path prefix for this Galaxy app (e.g. /proxy/google/v1/apps/{project}/{appName}/galaxy). + # Passed to ansible as galaxy_url_prefix so Galaxy generates correct absolute links (JS/CSS/API) + # that include the full proxy path. Without this Galaxy emits links rooted at / which the + # browser resolves against Leo's host and gets 404s → blank page. + GALAXY_URL_PREFIX=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/galaxy-url-prefix" -H "Metadata-Flavor: Google" 2>/dev/null || echo "") + echo "[$(date)] - Galaxy URL prefix: ${GALAXY_URL_PREFIX}" + + GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git") + GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master") + + PULL_ARGS=( + -U "${GIT_REPO}" + -C "${GIT_BRANCH}" + -d /home/debian/ansible + -i /tmp/ansible-inventory/localhost + --accept-host-key + --limit 127.0.0.1 + --extra-vars "gcp_batch_service_account_email=${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}" + ) + + if [ "$RESTORE_GALAXY" = "true" ]; then + PULL_ARGS+=(--extra-vars "restore_galaxy=true") + echo "[$(date)] - Galaxy Restore Mode: Enabled" + else + echo "[$(date)] - Galaxy Restore Mode: Disabled" + fi + + if [ -n "$GALAXY_URL_PREFIX" ]; then + PULL_ARGS+=(--extra-vars "galaxy_prefix=${GALAXY_URL_PREFIX}") + echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}" + fi + + PULL_ARGS+=(playbook.yml) + + mkdir -p /tmp/ansible-inventory + cat > /tmp/ansible-inventory/localhost << EOF + [vm] + 127.0.0.1 ansible_connection=local ansible_python_interpreter="/usr/bin/python3" + + [all:vars] + ansible_user="debian" + rke2_token="defaultSecret12345" + rke2_additional_sans=["${HOST_IP}"] + rke2_debug=true + nfs_size="${PV_SIZE}" + galaxy_persistence_size="${PV_SIZE}" + galaxy_db_password="gxy-db-password" + galaxy_user="dev@galaxyproject.org" + EOF + + echo "[$(date)] - Inventory file created at /tmp/ansible-inventory/localhost; running ansible-pull..." + echo "[$(date)] - Running: ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull ${PULL_ARGS[@]}" + + ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull "${PULL_ARGS[@]}" + ' + + echo "[$(date)] - Bootstrap script completed." 
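+
+      # A minimal sketch (hypothetical helper, not part of the synced upstream script) of
+      # the metadata-with-default pattern the curl calls above all follow:
+      #
+      #   fetch_metadata() {
+      #     local key="$1" default="$2" value
+      #     value=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/${key}" \
+      #       -H "Metadata-Flavor: Google" 2>/dev/null)
+      #     echo "${value:-$default}"
+      #   }
+      #
+      #   GIT_BRANCH=$(fetch_metadata "git-branch" "master")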
+ +runcmd: + - /usr/local/bin/galaxy_bootstrap.sh diff --git a/http/src/main/resources/init-resources/startup.sh b/http/src/main/resources/init-resources/startup.sh index 6375f7d7a0..4701c3a892 100644 --- a/http/src/main/resources/init-resources/startup.sh +++ b/http/src/main/resources/init-resources/startup.sh @@ -122,6 +122,10 @@ function failScriptIfError() { function validateCert() { certFileDirectory=$1 + ## Only the master node has certs; worker nodes in multi-node Dataproc clusters do not. + if [ ! -f "${certFileDirectory}/jupyter-server.crt" ]; then + return 0 + fi ## This helps when we need to rotate certs. notAfter=`openssl x509 -enddate -noout -in ${certFileDirectory}/jupyter-server.crt` # output should be something like `notAfter=Jul 4 20:31:52 2026 GMT` diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf index 06d866b7fb..d4ea61e98c 100644 --- a/http/src/main/resources/reference.conf +++ b/http/src/main/resources/reference.conf @@ -210,6 +210,44 @@ vpc { northamerica-northeast2 = "10.26.0.0/20" } firewallsToAdd = [ + # Allows Galaxy VM traffic on port 80 (nginx ingress on hostNetwork) + { + name-prefix = "leonardo-allow-http" + sourceRanges = { + us-central1 = ["0.0.0.0/0"] + northamerica-northeast1 = ["0.0.0.0/0"] + southamerica-east1 = ["0.0.0.0/0"] + us-east1 = ["0.0.0.0/0"] + us-east4 = ["0.0.0.0/0"] + us-west1 = ["0.0.0.0/0"] + us-west2 = ["0.0.0.0/0"] + us-west3 = ["0.0.0.0/0"] + us-west4 = ["0.0.0.0/0"] + europe-central2 = ["0.0.0.0/0"] + europe-north1 = ["0.0.0.0/0"] + europe-west1 = ["0.0.0.0/0"] + europe-west2 = ["0.0.0.0/0"] + europe-west3 = ["0.0.0.0/0"] + europe-west4 = ["0.0.0.0/0"] + europe-west6 = ["0.0.0.0/0"] + asia-east1 = ["0.0.0.0/0"] + asia-east2 = ["0.0.0.0/0"] + asia-northeast1 = ["0.0.0.0/0"] + asia-northeast2 = ["0.0.0.0/0"] + asia-northeast3 = ["0.0.0.0/0"] + asia-south1 = ["0.0.0.0/0"] + asia-southeast1 = ["0.0.0.0/0"] + asia-southeast2 = ["0.0.0.0/0"] + australia-southeast1 = ["0.0.0.0/0"] + northamerica-northeast2 = ["0.0.0.0/0"] + } + allowed = [ + { + protocol = "tcp" + port = "80" + } + ] + }, # Allows Leonardo proxy traffic on port 443 { name-prefix = "leonardo-allow-https" @@ -395,6 +433,21 @@ groups { } } +galaxyVm { + # Pre-built galaxy-k8s-boot image with all dependencies (Ansible, RKE2, etc.) pre-installed. + # Has cloud-init, which processes the "user-data" metadata key on first boot. + # Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team) + sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25" + machineType = "n1-highmem-8" + bootDiskSizeGb = 100 + postgresDiskSizeGb = 10 + # Suffix appended to the NFS disk name to derive the postgres disk name. + # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix). 
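+  # Illustration with a hypothetical suffix "-postgres": an NFS disk named "disk-abc123"
+  # would yield a postgres disk named "disk-abc123-postgres" (see GKEAlgebra.getGalaxyPostgresDiskName).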
+ postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix} + gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git" + gitBranch = "master" +} + gke { cluster { location = "us-central1-a", diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala index 45a3e26f38..08eb73020d 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala @@ -131,6 +131,18 @@ object Config { ) } + implicit private val galaxyVmConfigReader: ValueReader[GalaxyVmConfig] = ValueReader.relative { config => + GalaxyVmConfig( + config.as[GceCustomImage]("sourceImage"), + config.as[MachineTypeName]("machineType"), + config.as[DiskSize]("bootDiskSizeGb"), + config.as[DiskSize]("postgresDiskSizeGb"), + config.as[String]("postgresDiskNameSuffix"), + config.as[String]("gitRepo"), + config.as[String]("gitBranch") + ) + } + implicit private val gceConfigReader: ValueReader[GceConfig] = ValueReader.relative { config => GceConfig( config.as[GceCustomImage]("customGceImage"), @@ -497,6 +509,7 @@ object Config { val googleGroupsConfig = config.as[GoogleGroupsConfig]("groups") val dataprocConfig = config.as[DataprocConfig]("dataproc") + val galaxyVmConfig = config.as[GalaxyVmConfig]("galaxyVm") val gceConfig = config.as[GceConfig]("gce") val imageConfig = config.as[ImageConfig]("image") val prometheusConfig = config.as[PrometheusConfig]("prometheus") @@ -901,13 +914,13 @@ object Config { vpcConfig.networkTag, org.broadinstitute.dsde.workbench.leonardo.http.ConfigReader.appConfig.terraAppSetupChart, gkeIngressConfig, - gkeGalaxyAppConfig, gkeCromwellAppConfig, gkeCustomAppConfig, gkeAllowedAppConfig, appMonitorConfig, gkeClusterConfig, proxyConfig, - gkeGalaxyDiskConfig + gkeGalaxyDiskConfig, + galaxyVmConfig ) } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala index 2b3fc4a587..bd6712279f 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala @@ -1,7 +1,9 @@ package org.broadinstitute.dsde.workbench.leonardo.config -import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceAccountName +import org.broadinstitute.dsde.workbench.google2.{KubernetesSerializableName, MachineTypeName} +import KubernetesSerializableName.ServiceAccountName import org.broadinstitute.dsde.workbench.leonardo.AppType._ +import org.broadinstitute.dsde.workbench.leonardo.CustomImage.GceCustomImage import org.broadinstitute.dsde.workbench.leonardo._ import org.broadinstitute.dsp.{ChartName, ChartVersion} @@ -92,6 +94,16 @@ final case class CustomAppConfig(chartName: ChartName, val appType: AppType = AppType.Custom } +final case class GalaxyVmConfig( + sourceImage: GceCustomImage, + machineType: MachineTypeName, + bootDiskSizeGb: DiskSize, + postgresDiskSizeGb: DiskSize, + postgresDiskNameSuffix: String, + gitRepo: String, + gitBranch: String +) + final case class ContainerRegistryUsername(asString: String) extends AnyVal final case class ContainerRegistryPassword(asString: String) extends AnyVal final case class ContainerRegistryCredentials(username: 
ContainerRegistryUsername, password: ContainerRegistryPassword) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala index 84a04b45da..7fc78868ef 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala @@ -23,7 +23,7 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client: traceId: TraceId ): F[Boolean] = Proxy.getAppTargetHost[F](kubernetesDnsCache, CloudContext.Gcp(googleProject), appName) flatMap { - case HostReady(targetHost, _, _) => + case HostReady(targetHost, _, _, _) => val serviceUrl = serviceName match { case ServiceName("welder-service") => s"https://${targetHost.address}/proxy/google/v1/apps/${googleProject.value}/${appName.value}/${serviceName.value}/status/" @@ -44,6 +44,20 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client: ) case _ => Async[F].pure(false) // Update once we support Relay for apps } + + def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] = + client + .status( + Request[F]( + method = Method.GET, + uri = Uri.unsafeFromString(s"http://${ip.asString}:${port}/"), + headers = Headers(Header.Raw(CIString("X-Request-ID"), traceId.asString)) + ) + ) + .map(status => status.code < 500) + .handleErrorWith(t => + logger.error(Map("traceId" -> traceId.asString), t)("Fail to check if VM is reachable").as(false) + ) } trait AppDAO[F[_]] { @@ -52,4 +66,6 @@ trait AppDAO[F[_]] { serviceName: ServiceName, traceId: TraceId ): F[Boolean] + + def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala index 9bb06e9548..4d09a08109 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala @@ -36,7 +36,9 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie headers = Headers.empty ) ) - .handleError(_ => false) + .handleErrorWith(e => + logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false) + ) case _ => F.pure(false) } } yield res diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala index 6f6251a07d..ee1b07ddb2 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala @@ -11,7 +11,10 @@ object HostStatus { final case object HostNotFound extends HostStatus final case object HostNotReady extends HostStatus final case object HostPaused extends HostStatus - final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider) extends HostStatus { + // useHttp = true means the proxy connects to the backend via plain HTTP (port 80) instead of + // HTTPS (proxyConfig.proxyPort). Used for Galaxy VM apps whose nginx serves HTTP only. 
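+  // For example, KubernetesDnsCache resolves a Galaxy VM app to
+  // HostReady(host, "", CloudProvider.Gcp, useHttp = true); every other backend keeps the
+  // default useHttp = false and is proxied over HTTPS.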
+  final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider, useHttp: Boolean = false)
+      extends HostStatus {
     def toUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/proxy/${path}")

     def toNotebooksUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/notebooks/${path}")
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
index 6f7ff4783c..a0bc7196fb 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
@@ -7,6 +7,7 @@ import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus
 import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus.{HostNotFound, HostNotReady, HostReady}
 import org.broadinstitute.dsde.workbench.leonardo.db.{DbReference, KubernetesServiceDbQueries}
 import org.broadinstitute.dsde.workbench.leonardo.http.{kubernetesProxyHost, GetAppResult}
+import org.broadinstitute.dsde.workbench.leonardo.AppType.Galaxy
 import org.broadinstitute.dsde.workbench.leonardo.{AppName, CloudContext, CloudProvider}
 import org.broadinstitute.dsde.workbench.model.IP
 import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
@@ -48,8 +49,14 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics](
       case None => F.pure[HostStatus](HostNotReady)
       case Some(ip) =>
         val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain)
+        // Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP,
+        // and we map the fake hostname to the VM's external IP (stored in loadBalancerIp).
+        // External IP is used because Leo's pod is in a different VPC from the user's workspace project.
+        val isGalaxyVm = appResult.app.appType == Galaxy
         hostToIpMapping
           .getAndUpdate(_ + (h.address -> ip))
-          .as[HostStatus](HostReady(h, "", CloudProvider.Gcp)) // TODO: update this once we start support AKS
+          .as[HostStatus](
+            HostReady(h, "", CloudProvider.Gcp, useHttp = isGalaxyVm)
+          ) // TODO: update this once we start supporting AKS
     }
   }
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
index 5d989946bd..5f407ad424 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
@@ -256,7 +256,7 @@ class ProxyService(
       hostStatus <- getRuntimeTargetHost(cloudContext, runtimeName)

       _ <- hostStatus match {
-        case HostReady(_, _, _) =>
+        case HostReady(_, _, _, _) =>
           dateAccessUpdaterQueue.offer(
             UpdateDateAccessedMessage(UpdateTarget.Runtime(runtimeName), cloudContext, ctx.now)
           )
@@ -350,11 +350,15 @@ class ProxyService(
       } else IO.unit
       hostStatus <- getAppTargetHost(cloudContext, appName)
       _ <- hostStatus match {
-        case HostReady(_, _, _) =>
+        case HostReady(_, _, _, _) =>
           dateAccessUpdaterQueue.offer(UpdateDateAccessedMessage(UpdateTarget.App(appName), cloudContext, ctx.now))
         case _ => IO.unit
       }
       hostContext = HostContext(hostStatus, s"${cloudContext.asString}/${appName.value}/${serviceName.value}")
+      // Galaxy VM apps: forward the full Leo proxy path to the VM unchanged.
+ // galaxy-k8s-boot configures nginx location blocks and galaxy_url_prefix + // using the ingress.path value (= the Leo proxy prefix), so the VM expects + // to receive the full path including the prefix. r <- proxyInternal(hostContext, request) appType <- appQuery.getAppType(appName).transaction result = if (r.status.isSuccess()) "success" else "failure" @@ -376,16 +380,16 @@ class ProxyService( for { ctx <- ev.ask[AppContext] res <- hostContext.status match { - case HostReady(targetHost, _, _) => + case HostReady(targetHost, _, _, useHttp) => // If this is a WebSocket request (e.g. wss://leo:8080/...) then akka-http injects a // virtual UpgradeToWebSocket header which contains facilities to handle the WebSocket data. // The presence of this header distinguishes WebSocket from http requests. val res = for { response <- request.attribute(AttributeKeys.webSocketUpgrade) match { case Some(upgrade) => - IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade))) + IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade, useHttp))) case None => - IO.fromFuture(IO(handleHttpRequest(targetHost, request))) + IO.fromFuture(IO(handleHttpRequest(targetHost, request, useHttp))) } r <- if (response.status.isFailure()) @@ -418,9 +422,10 @@ class ProxyService( } } yield res - private def handleHttpRequest(targetHost: Host, request: HttpRequest): Future[HttpResponse] = { - logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}") - + private def handleHttpRequest(targetHost: Host, + request: HttpRequest, + useHttp: Boolean = false + ): Future[HttpResponse] = { // A note on akka-http philosophy: // The Akka HTTP server is implemented on top of Streams and makes heavy use of it. Requests come // in as a Source[HttpRequest] and responses are returned as a Sink[HttpResponse]. The transformation @@ -429,12 +434,24 @@ class ProxyService( // Initializes a Flow representing a prospective connection to the given endpoint. The connection // is not made until a Source and Sink are plugged into the Flow (i.e. it is materialized). - val flow = Http() - .connectionTo(targetHost.address) - .toPort(proxyConfig.proxyPort) - .withCustomHttpsConnectionContext(httpsConnectionContext) - .withClientConnectionSettings(clientConnectionSettings) - .https() + // Galaxy VM apps use plain HTTP on port 80; all other backends use HTTPS on proxyConfig.proxyPort. + val flow = + if (useHttp) { + logger.debug(s"Opening http connection to ${targetHost.address}:80") + Http() + .connectionTo(targetHost.address) + .toPort(80) + .withClientConnectionSettings(clientConnectionSettings) + .http() + } else { + logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}") + Http() + .connectionTo(targetHost.address) + .toPort(proxyConfig.proxyPort) + .withCustomHttpsConnectionContext(httpsConnectionContext) + .withClientConnectionSettings(clientConnectionSettings) + .https() + } // Now build a Source[Request] out of the original HttpRequest. We need to make some modifications // to the original request in order for the proxy to work: @@ -492,7 +509,8 @@ class ProxyService( private def handleWebSocketRequest(targetHost: Host, request: HttpRequest, - upgrade: WebSocketUpgrade + upgrade: WebSocketUpgrade, + useHttp: Boolean = false ): Future[HttpResponse] = { logger.info(s"Opening websocket connection to ${targetHost.address}") @@ -509,19 +527,35 @@ class ProxyService( // Make a single WebSocketRequest to the notebook server, passing in our Flow. 
This returns a Future[WebSocketUpgradeResponse]. // Keep our publisher/subscriber (e.g. sink/source) for use later. These are returned because we specified Keep.both above. // Note that we are rewriting the paths for any requests that are routed to /proxy/*/*/jupyter/ - val (responseFuture, (publisher, subscriber)) = Http().singleWebSocketRequest( - WebSocketRequest( - request.uri.copy(path = rewriteJupyterPath(request.uri.path), - authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort), - scheme = "wss" - ), - extraHeaders = filterHeaders(request.headers), - upgrade.requestedProtocols.headOption - ), - flow, - httpsConnectionContext, - settings = clientConnectionSettings - ) + // Galaxy VM apps use ws:// on port 80; all other backends use wss:// on proxyConfig.proxyPort. + val (responseFuture, (publisher, subscriber)) = + if (useHttp) + Http().singleWebSocketRequest( + WebSocketRequest( + request.uri.copy(path = rewriteJupyterPath(request.uri.path), + authority = request.uri.authority.copy(host = targetHost, port = 80), + scheme = "ws" + ), + extraHeaders = filterHeaders(request.headers), + upgrade.requestedProtocols.headOption + ), + flow, + settings = clientConnectionSettings + ) + else + Http().singleWebSocketRequest( + WebSocketRequest( + request.uri.copy(path = rewriteJupyterPath(request.uri.path), + authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort), + scheme = "wss" + ), + extraHeaders = filterHeaders(request.headers), + upgrade.requestedProtocols.headOption + ), + flow, + httpsConnectionContext, + settings = clientConnectionSettings + ) // If we got a valid WebSocketUpgradeResponse, call handleMessages with our publisher/subscriber, which are // already materialized from the HttpRequest. diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala index 8c33c93579..fb71a5603d 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala @@ -980,9 +980,14 @@ class LeoPubsubMessageSubscriber[F[_]]( ) .void - // create second Galaxy disk asynchronously + // Restore mode: disk already exists (no new disk to create) for a Galaxy app. + // In this case we skip creating new disks and pass restore=true so the VM + // runs the Ansible restore playbook instead of a fresh install. + restore = msg.appType == AppType.Galaxy && msg.createDisk.isEmpty + + // create second Galaxy disk asynchronously (only for fresh installs) createSecondDiskOp = - if (msg.appType == AppType.Galaxy && disk.isDefined) { + if (msg.appType == AppType.Galaxy && disk.isDefined && !restore) { val d = disk.get // it's safe to do `.get` here because we've verified for { res <- createGalaxyPostgresDiskOnlyInGoogle(msg.project, ZoneName("us-central1-a"), msg.appName, d.name) @@ -999,15 +1004,20 @@ class LeoPubsubMessageSubscriber[F[_]]( } yield res } else F.unit + // Galaxy uses VM-based deployment: no GKE cluster or nodepool is created. + // The VM is launched inside createAndPollApp instead. 
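+      // Galaxy branch summary: msg.createDisk = Some(_) -> fresh install (new NFS and
+      // postgres disks, restore = false); msg.createDisk = None -> reattach retained
+      // disks (restore = true, no disk creation). Neither case creates GKE resources.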
+ effectiveClusterOrNodepoolOp = + if (msg.appType == AppType.Galaxy) F.unit else createClusterOrNodepoolOp + // build asynchronous task task = for { // parallelize disk creation and cluster/nodepool monitoring - _ <- List(createDiskOp, createSecondDiskOp, createClusterOrNodepoolOp).parSequence_ + _ <- List(createDiskOp, createSecondDiskOp, effectiveClusterOrNodepoolOp).parSequence_ // create and monitor app _ <- getGkeAlgFromRegistry() .createAndPollApp( - CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount) + CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount, restore) ) .onError { case e => cleanUpAfterCreateAppError(msg.appId, msg.appName, msg.project, msg.createDisk, e) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala index 7fa725c8cf..3c54efd044 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala @@ -1,10 +1,8 @@ package org.broadinstitute.dsde.workbench.leonardo package util -import org.broadinstitute.dsde.workbench.google2.DiskName import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName} -import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.Autopilot import org.broadinstitute.dsde.workbench.leonardo.dao.CustomAppService import org.broadinstitute.dsde.workbench.leonardo.http.kubernetesProxyHost @@ -15,111 +13,6 @@ import org.broadinstitute.dsp.Release import java.nio.charset.StandardCharsets private[leonardo] object BuildHelmChartValues { - def buildGalaxyChartOverrideValuesString(config: GKEInterpreterConfig, - appName: AppName, - release: Release, - cluster: KubernetesCluster, - nodepoolName: NodepoolName, - userEmail: WorkbenchEmail, - customEnvironmentVariables: Map[String, String], - ksa: ServiceAccountName, - namespaceName: NamespaceName, - nfsDisk: PersistentDisk, - postgresDiskName: DiskName, - machineType: AppMachineType, - galaxyRestore: Option[GalaxyRestore] - ): List[String] = { - val k8sProxyHost = kubernetesProxyHost(cluster, config.proxyConfig.proxyDomain).address - val leoProxyhost = config.proxyConfig.getProxyServerHostName - val ingressPath = s"/proxy/google/v1/apps/${cluster.cloudContext.asString}/${appName.value}/galaxy" - val workspaceName = customEnvironmentVariables.getOrElse("WORKSPACE_NAME", "") - val workspaceNamespace = customEnvironmentVariables.getOrElse("WORKSPACE_NAMESPACE", "") - // Machine type info - val maxLimitMemory = machineType.memorySizeInGb - val maxLimitCpu = machineType.numOfCpus - val maxRequestMemory = maxLimitMemory - 22 - val maxRequestCpu = maxLimitCpu - 6 - - // Custom EV configs - val configs = customEnvironmentVariables.toList.zipWithIndex.flatMap { case ((k, v), i) => - List( - raw"""configs.$k=$v""", - raw"""extraEnv[$i].name=$k""", - raw"""extraEnv[$i].valueFrom.configMapKeyRef.name=${release.asString}-galaxykubeman-configs""", - raw"""extraEnv[$i].valueFrom.configMapKeyRef.key=$k""" - ) - } - - val galaxyRestoreSettings = galaxyRestore.fold(List.empty[String])(g => - List( - raw"""restore.persistence.nfs.galaxy.pvcID=${g.galaxyPvcId.asString}""", 
- raw"""galaxy.persistence.existingClaim=${release.asString}-galaxy-galaxy-pvc""" - ) - ) - // Using the string interpolator raw""" since the chart keys include quotes to escape Helm - // value override special characters such as '.' - // https://helm.sh/docs/intro/using_helm/#the-format-and-limitations-of---set - List( - // Storage class configs - raw"""nfs.storageClass.name=nfs-${release.asString}""", - raw"""galaxy.persistence.storageClass=nfs-${release.asString}""", - // Node selector config: this ensures the app is run on the user's nodepool - raw"""galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - raw"""nfs.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - raw"""galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: ${nodepoolName.value}""", - raw"""galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - // Ingress configs - raw"""galaxy.ingress.path=${ingressPath}""", - raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://${k8sProxyHost}""", - raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=${leoProxyhost}""", - raw"""galaxy.ingress.hosts[0].host=${k8sProxyHost}""", - raw"""galaxy.ingress.hosts[0].paths[0].path=${ingressPath}""", - raw"""galaxy.ingress.tls[0].hosts[0]=${k8sProxyHost}""", - raw"""galaxy.ingress.tls[0].secretName=tls-secret""", - // CVMFS configs - raw"""cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-${release.asString}""", - raw"""cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache""", - // Galaxy configs - raw"""galaxy.configs.galaxy\.yml.galaxy.single_user=${userEmail.value}""", - raw"""galaxy.configs.galaxy\.yml.galaxy.admin_users=${userEmail.value}""", - raw"""galaxy.terra.launch.workspace=${workspaceName}""", - raw"""galaxy.terra.launch.namespace=${workspaceNamespace}""", - raw"""galaxy.terra.launch.apiURL=${config.galaxyAppConfig.orchUrl.value}""", - raw"""galaxy.terra.launch.drsURL=${config.galaxyAppConfig.drsUrl.value}""", - // Tusd ingress configs - raw"""galaxy.tusd.ingress.hosts[0].host=${k8sProxyHost}""", - raw"""galaxy.tusd.ingress.hosts[0].paths[0].path=${ingressPath}/api/upload/resumable_upload""", - raw"""galaxy.tusd.ingress.tls[0].hosts[0]=${k8sProxyHost}""", - raw"""galaxy.tusd.ingress.tls[0].secretName=tls-secret""", - // Set RabbitMQ storage class - raw"""galaxy.rabbitmq.persistence.storageClassName=nfs-${release.asString}""", - // Set Machine Type specs - raw"""galaxy.jobs.maxLimits.memory=${maxLimitMemory}""", - raw"""galaxy.jobs.maxLimits.cpu=${maxLimitCpu}""", - raw"""galaxy.jobs.maxRequests.memory=${maxRequestMemory}""", - raw"""galaxy.jobs.maxRequests.cpu=${maxRequestCpu}""", - raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=${maxRequestMemory}""", - raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=${maxRequestCpu}""", - // RBAC configs - raw"""galaxy.serviceAccount.create=false""", - raw"""galaxy.serviceAccount.name=${ksa.value}""", - raw"""rbac.serviceAccount=${ksa.value}""", - // Persistence configs - raw"""persistence.nfs.name=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}""", - raw"""persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=${nfsDisk.name.value}""", - raw"""persistence.nfs.size=${nfsDisk.size.gb.toString}Gi""", - raw"""persistence.postgres.name=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}""", - 
raw"""galaxy.postgresql.galaxyDatabasePassword=${config.galaxyAppConfig.postgresPassword.value}""", - raw"""persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=${postgresDiskName.value}""", - raw"""persistence.postgres.size=${config.galaxyDiskConfig.postgresDiskSizeGB.gb.toString}Gi""", - raw"""nfs.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}-pvc""", - raw"""nfs.persistence.size=${nfsDisk.size.gb.toString}Gi""", - raw"""galaxy.postgresql.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}-pvc""", - // Note Galaxy pvc claim is the nfs disk size minus 50G - raw"""galaxy.persistence.size=${(nfsDisk.size.gb - 50).toString}Gi""" - ) ++ configs ++ galaxyRestoreSettings - } - def buildCromwellAppChartOverrideValuesString(config: GKEInterpreterConfig, appName: AppName, cluster: KubernetesCluster, diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala index 22108113e1..9a81c53064 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala @@ -125,7 +125,8 @@ final case class CreateAppParams(appId: AppId, googleProject: GoogleProject, appName: AppName, appMachineType: Option[AppMachineType], - bucketNameToMount: Option[GcsBucketName] + bucketNameToMount: Option[GcsBucketName], + restore: Boolean = false ) final case class DeleteClusterParams(clusterId: KubernetesClusterLeoId, googleProject: GoogleProject) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index f3d609f228..1901685b67 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -6,7 +6,19 @@ import cats.effect.Async import cats.mtl.Ask import cats.syntax.all._ import com.google.auth.oauth2.GoogleCredentials -import com.google.cloud.compute.v1.Disk +import com.google.cloud.compute.v1.{ + AccessConfig, + Allowed, + AttachedDisk, + AttachedDiskInitializeParams, + Firewall, + Instance, + Items, + Metadata, + NetworkInterface, + ServiceAccount, + Tags +} import com.google.container.v1._ import fs2.io.file.Files import org.broadinstitute.dsde.workbench.DoneCheckable @@ -27,28 +39,31 @@ import org.broadinstitute.dsde.workbench.google2.{ streamFUntilDone, streamUntilDoneOrTimeout, tracedRetryF, - DiskName, + FirewallRuleName, GoogleComputeService, GoogleDiskService, GoogleResourceService, KubernetesClusterNotFoundException, - PvName, + NetworkName, + RegionName, + SubnetworkName, ZoneName } +import org.broadinstitute.dsde.workbench.util2.InstanceName import org.broadinstitute.dsde.workbench.leonardo.dao.{AppDAO, AppDescriptorDAO} import org.broadinstitute.dsde.workbench.leonardo.db._ import org.broadinstitute.dsde.workbench.leonardo.http._ import org.broadinstitute.dsde.workbench.leonardo.http.service.AppNotFoundException +import org.broadinstitute.dsde.workbench.leonardo.dao.google.{buildMachineTypeUri, buildSubnetworkUri} import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{ buildAllowedAppChartOverrideValuesString, buildCromwellAppChartOverrideValuesString, - 
buildCustomChartOverrideValuesString, - buildGalaxyChartOverrideValuesString + buildCustomChartOverrideValuesString } import org.broadinstitute.dsde.workbench.leonardo.model.LeoException -import org.broadinstitute.dsde.workbench.leonardo.monitor.PubsubHandleMessageError.PubsubKubernetesError import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._ -import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject} +import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject, ServiceAccountDisplayName} +import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail} import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics import org.broadinstitute.dsp._ @@ -387,21 +402,62 @@ class GKEInterpreter[F[_]]( ) ) app = dbApp.app - namespaceName = app.appResources.namespace dbCluster = dbApp.cluster - gkeClusterId = dbCluster.getClusterId googleProject = params.googleProject + // Idempotency: if the app is already Running, skip creation to avoid double-recording usage + _ <- + if (app.status == AppStatus.Running) + logger.info(ctx.loggingCtx)( + s"App ${app.appName.value} is already Running, skipping creation (idempotent)" + ) + else + for { + diskOpt <- appQuery.getDiskId(app.id).transaction + diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId)) - // TODO: This DB query might not be needed if it makes sense to add diskId in App model (will revisit in next PR) - diskOpt <- appQuery.getDiskId(app.id).transaction - diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId)) + _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.") - // Create namespace and secrets - _ <- logger.info(ctx.loggingCtx)( - s"Begin App(${app.appName.value}) Creation." - ) + nfsDisk <- F.fromOption( + dbApp.app.appResources.disk, + AppCreationException( + s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}" + ) + ) - // Create KSA + // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path. + _ <- app.appType match { + case AppType.Galaxy => + installGalaxyVm(dbCluster, app, nfsDisk, googleProject, params.restore) >> + persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void + + case _ => + createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx) + } + + _ <- logger.info(ctx.loggingCtx)( + s"Finished app creation for app ${app.appName.value}" + ) + + readyTime <- F.realTimeInstant + _ <- appUsageQuery.recordStart(params.appId, readyTime) + _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction + } yield () + } yield () + + // GKE/Helm path for non-Galaxy app types (Cromwell, Allowed, Custom). 
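+  // Extracted from createAndPollApp so the Galaxy VM path can skip the Kubernetes-only
+  // steps (KSA creation, workload identity binding, helm install) while both paths share
+  // the disk lookup, usage recording, and final transition to Running above.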
+ private def createAndPollAppViaHelm( + params: CreateAppParams, + dbApp: GetAppResult, + app: App, + dbCluster: KubernetesCluster, + nfsDisk: PersistentDisk, + diskId: DiskId, + googleProject: GoogleProject, + ctx: AppContext + )(implicit ev: Ask[F, AppContext]): F[Unit] = { + val namespaceName = app.appResources.namespace + val gkeClusterId = dbCluster.getClusterId + for { ksaName <- F.fromOption( app.appResources.kubernetesServiceAccountName, AppCreationException( @@ -424,11 +480,6 @@ class GKEInterpreter[F[_]]( ) } - nfsDisk <- F.fromOption( - dbApp.app.appResources.disk, - AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}") - ) - helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) _ <- helmClient @@ -442,12 +493,8 @@ class GKEInterpreter[F[_]]( true ) .run(helmAuthContext) - // update KSA in DB _ <- appQuery.updateKubernetesServiceAccount(app.id, ksaName).transaction - // Associate GSA to newly created KSA - // This string is constructed based on Google requirements to associate a GSA to a KSA - // (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#creating_a_relationship_between_ksas_and_gsas) ksaToGsa = s"${googleProject.value}.svc.id.goog[${namespaceName.value}/${ksaName.value}]" call = F.fromFuture( F.delay( @@ -458,49 +505,15 @@ class GKEInterpreter[F[_]]( ) ) ) - retryConfig = RetryPredicates.retryConfigWithPredicates( - when409 - ) + retryConfig = RetryPredicates.retryConfigWithPredicates(when409) _ <- tracedRetryF(retryConfig)( call, s"googleIamDAO.addIamPolicyBindingOnServiceAccount for GSA ${gsa.value} & KSA ${ksaName.value}" ).compile.lastOrError - // TODO: validate app release is the same as restore release - appRestore: Option[AppRestore] <- persistentDiskQuery.getAppDiskRestore(diskId).transaction - galaxyRestore: Option[AppRestore.GalaxyRestore] = appRestore.flatMap { - case a: AppRestore.GalaxyRestore => Some(a) - case _: AppRestore.Other => None - } - nodepool = if (app.autopilot.isDefined) None else Some(dbApp.nodepool.nodepoolName) - // helm install and wait + _ <- app.appType match { - case AppType.Galaxy => - for { - machineType <- F.fromOption( - params.appMachineType, - new LeoException( - s"can't find machine config for ${googleProject.value}/${app.appName.value}. 
This should never happen", - traceId = Some(ctx.traceId) - ) - ) - _ <- installGalaxy( - helmAuthContext, - app.appName, - app.release, - app.chart, - dbCluster, - dbApp.nodepool.nodepoolName, // https://broadworkbench.atlassian.net/browse/IA-4987 - namespaceName, - app.auditInfo.creator, - app.customEnvironmentVariables, - ksaName, - nfsDisk, - machineType, - galaxyRestore - ) - } yield () case AppType.Cromwell => installCromwellApp( helmAuthContext, @@ -551,60 +564,15 @@ class GKEInterpreter[F[_]]( F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP")) } - _ <- logger.info(ctx.loggingCtx)( - s"Finished app creation for app ${app.appName.value} in cluster ${gkeClusterId.toString}" - ) - _ <- app.appType match { - case AppType.Galaxy => - if (galaxyRestore.isDefined) persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void - else - for { - pvcs <- kubeService.listPersistentVolumeClaims(gkeClusterId, - KubernetesNamespace(app.appResources.namespace) - ) - - _ <- pvcs - // We added an extra -galaxy here: https://github.com/galaxyproject/galaxykubeman-helm/blob/f7f27be74c213deda3ae53122[…]959c96480bb21f/galaxykubeman/templates/config-setup-galaxy.yaml - .find(pvc => pvc.getMetadata.getName == s"${app.release.asString}-galaxy-galaxy-pvc") - .fold( - F.raiseError[Unit]( - PubsubKubernetesError(AppError("Fail to retrieve pvc ids", - ctx.now, - ErrorAction.CreateApp, - ErrorSource.App, - None, - Some(ctx.traceId) - ), - Some(app.id), - false, - None, - None, - None - ) - ) - ) { galaxyPvc => - val galaxyDiskRestore = AppRestore.GalaxyRestore( - PvcId(galaxyPvc.getMetadata.getUid), - app.id - ) - persistentDiskQuery - .updateGalaxyDiskRestore(diskId, galaxyDiskRestore) - .transaction - .void - } - } yield () case AppType.Cromwell => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction case AppType.Allowed => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction case AppType.Custom => F.unit case _ => F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP")) } - - readyTime <- F.realTimeInstant - _ <- appUsageQuery.recordStart(params.appId, readyTime) - _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction } yield () + } override def deleteAndPollCluster(params: DeleteClusterParams)(implicit ev: Ask[F, AppContext]): F[Unit] = for { @@ -725,77 +693,88 @@ class GKEInterpreter[F[_]]( // Resolve the cluster in Google googleClusterOpt <- gkeService.getCluster(gkeClusterId) - _ <- googleClusterOpt - .traverse { googleCluster => - val uninstallCharts = for { - helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) - - _ <- logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" + _ <- app.appType match { + case AppType.Galaxy => + // Galaxy runs on a VM: delete the GCE instance. Disks are retained for persistence. + for { + gp <- F.fromOption( + LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext), + new RuntimeException("Galaxy app cloud context should be a google project") ) + instanceName = InstanceName(s"galaxy-${app.appName.value}") + zone = dbApp.app.appResources.disk.map(_.zone).getOrElse(ZoneName(config.clusterConfig.location.value)) + _ <- computeService + .deleteInstance(gp, zone, instanceName) + .void + .handleErrorWith { e => + logger.warn(ctx.loggingCtx)( + s"Failed to delete Galaxy VM ${instanceName.value}: ${e.getMessage}. 
Continuing with app deletion." + ) + } + // Mark the cluster DB record as deleted so future app creation in this project is not blocked. + // For Galaxy, the "cluster" is a pure DB abstraction (no real GKE cluster); it must be + // cleaned up here because no separate cluster-deletion pubsub message is sent. + _ <- kubernetesClusterQuery.markAsDeleted(dbCluster.id, ctx.now).transaction + } yield () - // helm uninstall the app chart and wait - _ <- helmClient - .uninstall(app.release, config.galaxyAppConfig.uninstallKeepHistory) - .run(helmAuthContext) + case _ => + // GKE/Helm path for all other app types + googleClusterOpt.traverse { googleCluster => + val uninstallCharts = for { + helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) - last <- streamFUntilDone( - kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), - config.monitorConfig.deleteApp.maxAttempts, - config.monitorConfig.deleteApp.interval - ).compile.lastOrError + _ <- logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" + ) - _ <- - if (!podDoneCheckable.isDone(last)) { - val msg = - s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last - .filterNot(isPodDone) - .map(_.name.value) - .mkString(", ")}" - logger.error(ctx.loggingCtx)(msg) >> - F.raiseError[Unit](AppDeletionException(msg)) - } else F.unit + _ <- helmClient + .uninstall(app.release, true) + .run(helmAuthContext) + + last <- streamFUntilDone( + kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), + config.monitorConfig.deleteApp.maxAttempts, + config.monitorConfig.deleteApp.interval + ).compile.lastOrError + + _ <- + if (!podDoneCheckable.isDone(last)) { + val msg = + s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. 
The following pods are not in a terminal state: ${last + .filterNot(isPodDone) + .map(_.name.value) + .mkString(", ")}" + logger.error(ctx.loggingCtx)(msg) >> + F.raiseError[Unit](AppDeletionException(msg)) + } else F.unit + + _ <- helmClient + .uninstall(getTerraAppSetupChartReleaseName(app.release), true) + .run(helmAuthContext) + } yield () - // helm uninstall the setup chart - _ <- helmClient - .uninstall( - getTerraAppSetupChartReleaseName(app.release), - config.galaxyAppConfig.uninstallKeepHistory + uninstallCharts.handleErrorWith { e => + logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" ) - .run(helmAuthContext) - } yield () - - uninstallCharts.handleErrorWith { e => - logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" + } + }.void >> + kubeService + .deleteNamespace(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) >> + streamUntilDoneOrTimeout( + kubeService + .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) + .map(!_), + 60, + 5 seconds, + "delete namespace timed out" ) - } - } - - // delete the namespace only after the helm uninstall completes - _ <- kubeService.deleteNamespace(dbApp.cluster.getClusterId, - KubernetesNamespace(dbApp.app.appResources.namespace) - ) - - fa = kubeService - .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) - .map(!_) // mapping to inverse because booleanDoneCheckable defines `Done` when it becomes `true`...In this case, the namespace will exists for a while, and eventually becomes non-existent + } - _ <- streamUntilDoneOrTimeout(fa, 60, 5 seconds, "delete namespace timed out") _ <- logger.info(ctx.loggingCtx)( - s"Delete app operation has finished for app ${app.appName.value} in cluster ${gkeClusterId.toString}" + s"Delete app operation has finished for app ${app.appName.value}" ) - appRestore: Option[AppRestore.GalaxyRestore] = dbApp.app.appResources.disk.flatMap(_.appRestore).flatMap { - case a: AppRestore.GalaxyRestore => Some(a) - case _: AppRestore.Other => None - } - _ <- appRestore.traverse { restore => - for { - _ <- kubeService.deletePv(dbCluster.getClusterId, PvName(s"pvc-${restore.galaxyPvcId.asString}")) - } yield () - } - _ <- if (!params.errorAfterDelete) { F.unit @@ -1016,29 +995,6 @@ class GKEInterpreter[F[_]]( } } yield () - private[leonardo] def getGalaxyPostgresDisk(diskName: DiskName, - namespaceName: NamespaceName, - project: GoogleProject, - zone: ZoneName - )(implicit traceId: Ask[F, AppContext]): F[Option[Disk]] = - for { - postgresDiskOpt <- googleDiskService - .getDisk( - project, - zone, - getGalaxyPostgresDiskName(diskName, config.galaxyDiskConfig.postgresDiskNameSuffix) - ) - res <- postgresDiskOpt match { - case Some(disk) => F.pure(Some(disk)) - case None => - googleDiskService.getDisk( - project, - zone, - getOldStyleGalaxyPostgresDiskName(namespaceName, config.galaxyDiskConfig.postgresDiskNameSuffix) - ) - } - } yield res - private[util] def installNginx(dbCluster: KubernetesCluster, googleCluster: Cluster)(implicit ev: Ask[F, AppContext] ): F[IP] = @@ -1085,94 +1041,325 @@ class GKEInterpreter[F[_]]( ) } yield loadBalancerIp - private[util] def 
installGalaxy(helmAuthContext: AuthContext, - appName: AppName, - release: Release, - chart: Chart, - dbCluster: KubernetesCluster, - nodepoolName: NodepoolName, - namespaceName: NamespaceName, - userEmail: WorkbenchEmail, - customEnvironmentVariables: Map[String, String], - kubernetesServiceAccount: ServiceAccountName, - nfsDisk: PersistentDisk, - machineType: AppMachineType, - galaxyRestore: Option[AppRestore.GalaxyRestore] - )(implicit - ev: Ask[F, AppContext] - ): F[Unit] = + private[util] def installGalaxyVm( + dbCluster: KubernetesCluster, + app: App, + nfsDisk: PersistentDisk, + googleProject: GoogleProject, + restore: Boolean + )(implicit ev: Ask[F, AppContext]): F[Unit] = for { ctx <- ev.ask _ <- logger.info(ctx.loggingCtx)( - s"Installing helm chart $chart for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}" + s"Installing Galaxy VM for app ${app.appName.value} in project ${googleProject.value}" ) - googleProject <- F.fromOption( - LeoLenses.cloudContextToGoogleProject.get(nfsDisk.cloudContext), - new RuntimeException("this should never happen. Galaxy disk's cloud context should be a google project") + + zoneParam = nfsDisk.zone + regionParam = RegionName(zoneParam.value.dropRight(2)) + + // Set up VPC and firewall + (network, subnetwork) <- vpcAlg.setUpProjectNetworkAndFirewalls( + SetUpProjectNetworkParams(googleProject, regionParam) ) - postgresDiskNameOpt <- for { - disk <- getGalaxyPostgresDisk(nfsDisk.name, namespaceName, googleProject, nfsDisk.zone) - } yield disk.map(x => DiskName(x.getName)) - postgresDiskName <- F.fromOption( - postgresDiskNameOpt, - AppCreationException(s"No postgres disk found in google for app ${appName.value} ", traceId = Some(ctx.traceId)) + // Load cloud-config content bundled from galaxy-k8s-boot bin/user_data.sh. + // Passed as the "user-data" metadata key, processed by cloud-init on first boot only. + // The galaxy-k8s-boot custom image has cloud-init pre-installed; "#cloud-config" must be + // the first line for cloud-init to recognise the file format. + // To update, sync manually from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh + userDataContent = scala.io.Source + .fromResource("init-resources/galaxy-user-data.sh") + .getLines() + .toList + .mkString("\n") + + // Derive postgres disk name using the same naming convention as the subscriber + postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, + config.galaxyDiskConfig.postgresDiskNameSuffix ) - chartValues = buildGalaxyChartOverrideValuesString( - config, - appName, - release, - dbCluster, - nodepoolName, - userEmail, - customEnvironmentVariables, - kubernetesServiceAccount, - namespaceName, - nfsDisk, - postgresDiskName, - machineType, - galaxyRestore + // Persistent-volume-size passed to ansible-pull. + // nfsDisk.size.gb is in decimal GB; convert to binary GiB before subtracting filesystem overhead. + // Example: a 500 GB disk = (500 * 10^9) / 2^30 ≈ 465 GiB, so we request 465 - 11 = 454 GiB. + // Using raw gb - 11 (treating GB as GiB) overestimates by ~23 GiB on a 500 GB disk and + // causes the NFS provisioner to fail with "insufficient available space". + diskSizeGiB = (nfsDisk.size.gb.toLong * 1000L * 1000L * 1000L) / (1024L * 1024L * 1024L) + pvSizeGi = math.max(1, diskSizeGiB - 11) + pvSize = s"${pvSizeGi}Gi" + + // Get or create the galaxy-batch-runner SA in the user's project. 
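+      // The resulting email is handed to the VM via the "gcp_batch_service_account_email"
+      // metadata key below; the bootstrap script falls back to its hard-coded default only
+      // when that key is absent.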
+ gcpBatchSa <- F + .fromFuture( + F.delay( + googleIamDAO.getOrCreateServiceAccount( + googleProject, + org.broadinstitute.dsde.workbench.model.google.ServiceAccountName("galaxy-batch-runner"), + ServiceAccountDisplayName("Galaxy Batch Runner") + ) + ) + ) + .map(sa => sa.email.value) + + // Disks — data and postgres disks are always pre-existing by the time this method runs + // (created by createDiskOp / createSecondDiskOp, or retained from a previous app). + // Use setSource to attach existing disks; only the boot disk is created fresh. + bootDisk = AttachedDisk + .newBuilder() + .setBoot(true) + .setAutoDelete(true) + .setInitializeParams( + AttachedDiskInitializeParams + .newBuilder() + .setSourceImage(config.galaxyVmConfig.sourceImage.asString) + .setDiskSizeGb(config.galaxyVmConfig.bootDiskSizeGb.gb) + .putAllLabels(Map("leonardo" -> "true").asJava) + .build() + ) + .build() + + // Galaxy data disk — device name must match what the bootstrap script expects + dataDisk = AttachedDisk + .newBuilder() + .setBoot(false) + .setDeviceName("galaxy-data") + .setAutoDelete(false) + .setSource( + s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${nfsDisk.name.value}" + ) + .build() + + // PostgreSQL disk — device name must match what the bootstrap script expects + postgresDisk = AttachedDisk + .newBuilder() + .setBoot(false) + .setDeviceName("galaxy-postgres-data") + .setAutoDelete(false) + .setSource( + s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${postgresDiskName.value}" + ) + .build() + + // Network interface with external IP + networkInterface = NetworkInterface + .newBuilder() + .setSubnetwork( + buildSubnetworkUri(googleProject, regionParam, subnetwork) + ) + .addAccessConfigs(AccessConfig.newBuilder().setName("Leonardo Galaxy VM external IP").build()) + .build() + + instanceName = InstanceName(s"galaxy-${app.appName.value}") + + instance = Instance + .newBuilder() + .setName(instanceName.value) + .setDescription("Leonardo Galaxy VM") + .setTags(Tags.newBuilder().addItems(config.vpcNetworkTag.value).build()) + .setMachineType(buildMachineTypeUri(zoneParam, config.galaxyVmConfig.machineType)) + .addNetworkInterfaces(networkInterface) + .addAllDisks(List(bootDisk, dataDisk, postgresDisk).asJava) + .addServiceAccounts( + ServiceAccount + .newBuilder() + .setEmail(app.googleServiceAccount.value) + .addAllScopes( + List( + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/logging.write" + ).asJava + ) + .build() + ) + .setMetadata( + Metadata + .newBuilder() + .addItems(Items.newBuilder().setKey("user-data").setValue(userDataContent).build()) + .addItems(Items.newBuilder().setKey("google-logging-enabled").setValue("true").build()) + .addItems(Items.newBuilder().setKey("gcp_batch_service_account_email").setValue(gcpBatchSa).build()) + .addItems(Items.newBuilder().setKey("persistent-volume-size").setValue(pvSize).build()) + .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restore.toString).build()) + .addItems(Items.newBuilder().setKey("git-repo").setValue(config.galaxyVmConfig.gitRepo).build()) + .addItems(Items.newBuilder().setKey("git-branch").setValue(config.galaxyVmConfig.gitBranch).build()) + .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build()) + .addItems(Items.newBuilder().setKey("gcp-network").setValue(network.value).build()) + .addItems(Items.newBuilder().setKey("gcp-subnet").setValue(subnetwork.value).build()) + // Galaxy needs to know its public URL prefix so it 
generates correct absolute links
+              // (JS, CSS, API calls) that include the full Leo proxy path.
+              // galaxy-k8s-boot's ansible playbook must accept the prefix extra-var (passed as
+              // galaxy_prefix by the bootstrap script) and set it in Galaxy's helm values (galaxy.yml).
+              // Without this, Galaxy generates links rooted at / which the browser resolves against
+              // Leo's host and gets 404s → blank page.
+              // e.g. for project "my-project" and app "app1" the value below is
+              //   /proxy/google/v1/apps/my-project/app1/galaxy
+              .addItems(
+                Items
+                  .newBuilder()
+                  .setKey("galaxy-url-prefix")
+                  .setValue(
+                    s"/proxy/google/v1/apps/${googleProject.value}/${app.appName.value}/galaxy"
+                  )
+                  .build()
+              )
+              .build()
+          )
+          .putAllLabels(Map("leonardo" -> "true").asJava)
+          .build()
+
+      _ <- computeService.createInstance(googleProject, zoneParam, instance)
+
+      // Grant the pet SA permission to submit and monitor GCP Batch jobs in this project.
+      // Galaxy uses the VM's attached SA (pet SA) to call the Batch API.
+      _ <- {
+        val call = F.fromFuture(
+          F.delay(
+            googleIamDAO
+              .addRoles(googleProject,
+                        app.googleServiceAccount,
+                        IamMemberTypes.ServiceAccount,
+                        Set("roles/batch.jobsEditor")
+              )
+              .void
+          )
+        )
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist)
+        tracedRetryF(retryConfig)(
+          call,
+          s"googleIamDAO.addRoles(batch.jobsEditor) for pet SA ${app.googleServiceAccount.value} in project ${googleProject.value}"
+        ).compile.lastOrError
+      }
+
+      // Grant the pet SA serviceAccountUser on the Batch SA so it can specify it as the job runner identity.
+      // Only attempted when the Batch SA lives in the same project as the user (i.e. not a shared platform SA).
+      // For cross-project Batch SAs, this binding must be set up externally (e.g. via Terraform).
+      // Project extraction example: "sa-name@my-project.iam.gserviceaccount.com" yields GoogleProject("my-project").
+      gcpBatchSaProject = GoogleProject(
+        gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", "")
+      )
+      _ <-
+        if (gcpBatchSaProject == googleProject)
+          F.fromFuture(
+            F.delay(
+              googleIamDAO.addIamPolicyBindingOnServiceAccount(
+                googleProject,
+                WorkbenchEmail(gcpBatchSa),
+                app.googleServiceAccount,
+                Set("roles/iam.serviceAccountUser")
+              )
+            )
+          )
+        else
+          logger.info(ctx.loggingCtx)(
+            s"Batch SA $gcpBatchSa is in a different project ($gcpBatchSaProject) than ${googleProject.value}; " +
+              s"skipping serviceAccountUser binding — must be configured externally"
+          )
+
+      // Grant the Batch SA the project-level roles it needs to run jobs and attach a service account to Batch VMs.
+      // See https://github.com/galaxyproject/galaxy-k8s-boot?tab=readme-ov-file#prerequisites
+      _ <- {
+        val call = F.fromFuture(
+          F.delay(
+            googleIamDAO
+              .addRoles(googleProject,
+                        WorkbenchEmail(gcpBatchSa),
+                        IamMemberTypes.ServiceAccount,
+                        Set("roles/batch.jobsEditor", "roles/iam.serviceAccountUser")
+              )
+              .void
+          )
+        )
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist)
+        tracedRetryF(retryConfig)(
+          call,
+          s"googleIamDAO.addRoles(batch.jobsEditor, iam.serviceAccountUser) for Batch SA $gcpBatchSa in project ${googleProject.value}"
+        ).compile.lastOrError
+      }
+
+      // Create an NFS firewall rule so GCP Batch VMs can reach the Galaxy VM's NFS server.
+      // Idempotent: skipped if the rule already exists.
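+      // For reference, a rough gcloud equivalent of the rule built below (a sketch only;
+      // <network> and <vpcNetworkTag> stand in for the resolved values):
+      //   gcloud compute firewall-rules create leonardo-galaxy-allow-nfs-for-batch \
+      //     --network=<network> --source-ranges=10.0.0.0/8 --target-tags=<vpcNetworkTag> \
+      //     --allow=tcp:2049,udp:2049,tcp:111,udp:111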
+      nfsFwName = FirewallRuleName("leonardo-galaxy-allow-nfs-for-batch")
+      nfsFwExists <- computeService.getFirewallRule(googleProject, nfsFwName)
+      _ <-
+        if (nfsFwExists.isEmpty) {
+          val nfsFirewall = Firewall
+            .newBuilder()
+            .setName(nfsFwName.value)
+            .setNetwork(s"projects/${googleProject.value}/global/networks/${network.value}")
+            .addSourceRanges("10.0.0.0/8")
+            .addTargetTags(config.vpcNetworkTag.value)
+            .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("2049").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("2049").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("111").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("111").build())
+            .build()
+          computeService
+            .addFirewallRule(googleProject, nfsFirewall)
+            .flatMap(op => F.blocking(op.get()).void)
+        } else
+          logger.info(ctx.loggingCtx)(
+            s"NFS firewall rule ${nfsFwName.value} already exists, skipping creation"
+          )
       _ <- logger.info(ctx.loggingCtx)(
-        s"Chart override values are: ${chartValues.map(s =>
-          if (s.contains("galaxyDatabasePassword")) "persistence.postgres.galaxyDatabasePassword="
-          else s
-        )}"
+        s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP"
       )
-      // Invoke helm
-      helmInstall = helmClient
-        .installChart(
-          release,
-          chart.name,
-          chart.version,
-          org.broadinstitute.dsp.Values(chartValues.mkString(",")),
-          false
+      // Poll until the instance has both internal and external IPs assigned.
+      // We store the external IP as the proxy backend (KubernetesDnsCache loadBalancerIp) because
+      // the internal IP is not routable from Leo's pod (see the asyncFields update below).
+      // The external IP is also used for the readiness health check (TCP to port 80); the internal
+      // IP is captured only for logging.
+      ipPairOpt <- streamFUntilDone(
+        computeService.getInstance(googleProject, zoneParam, instanceName).map { instanceOpt =>
+          instanceOpt.flatMap { inst =>
+            import scala.jdk.CollectionConverters._
+            for {
+              iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption)
+              internalIp = IP(iface.getNetworkIP)
+              cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption)
+              natIp <- Option(cfg.getNatIP).filter(_.nonEmpty)
+            } yield (internalIp, IP(natIp))
+          }
+        },
+        config.monitorConfig.createApp.maxAttempts,
+        config.monitorConfig.createApp.interval
+      ).compile.lastOrError
+
+      (internalIp, externalIp) <- F.fromOption(
+        ipPairOpt,
+        AppCreationException(
+          s"Galaxy VM ${instanceName.value} did not obtain an IP after ${config.monitorConfig.createApp.maxAttempts} attempts",
+          traceId = Some(ctx.traceId)
         )
-        .run(helmAuthContext)
+      )
-      // Currently we always retry.
-      // The main failure mode here is helm install, which does not have easily interpretable error codes
-      retryConfig = RetryPredicates.retryAllConfig
-      _ <- tracedRetryF(retryConfig)(
-        helmInstall,
-        s"helm install for app ${appName.value} in project ${dbCluster.cloudContext.asString}"
-      ).compile.lastOrError
+      _ <- logger.info(ctx.loggingCtx)(
+        s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing external IP in cluster async fields for proxy access"
       )
-      googleProject <- F.fromOption(
-        LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext),
-        new RuntimeException("trying to create a non google runtime in GKEInterpreter. This should never happen")
+      // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache.
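+      // For illustration: after this update, KubernetesDnsCache resolves this app's proxy route
+      // (/proxy/google/v1/apps/<project>/<appName>/galaxy) to the external IP stored here, so the
+      // proxied request lands directly on the VM.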
+      // The proxy will connect to this IP via HTTP on port 80 (Galaxy VM serves HTTP, not HTTPS).
+      // We use the external IP because Leo's GKE cluster is in Leo's GCP project while the Galaxy VM
+      // is in the user's workspace project — the two VPCs are not peered, so the internal IP is
+      // not routable from Leo's pod. The leonardo-allow-http firewall rule (0.0.0.0/0 → port 80,
+      // targeting VMs with the "leonardo" tag) allows Leo to reach the VM on its external IP.
+      _ <- kubernetesClusterQuery
+        .updateAsyncFields(
+          dbCluster.id,
+          KubernetesClusterAsyncFields(
+            externalIp,
+            IP(""),
+            NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
+          )
+        )
+        .transaction
+      _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction
+
+      _ <- logger.info(ctx.loggingCtx)(
+        s"Polling Galaxy readiness for app ${app.appName.value} via direct check (backend: ${externalIp.asString}:80)"
       )
-      // Poll galaxy until it starts up
-      // TODO potentially add other status checks for pod readiness, beyond just HTTP polling the galaxy-web service
-      // Wait a bit before starting polling for the app status check as the certificates might not be quite ready yet
-      // This seems to only impact galaxy, See https://broadworkbench.atlassian.net/browse/IA-4551
-      _ <- F.sleep(60 seconds)
+
+      // Wait for Galaxy's nginx to respond.
+      // Uses a direct HTTP check to the VM's external IP on port 80, bypassing the Leo proxy
+      // hostname chain (which would require the proxy wildcard DNS to be reachable from within
+      // the Leo pod — unreliable in BEE environments due to hairpin NAT).
       isDone <- streamFUntilDone(
-        appDao.isProxyAvailable(googleProject, appName, ServiceName("galaxy"), ctx.traceId),
+        appDao.isVmReachable(externalIp, 80, ctx.traceId),
         config.monitorConfig.createApp.maxAttempts,
         config.monitorConfig.createApp.interval
       ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError
@@ -1180,9 +1367,9 @@ class GKEInterpreter[F[_]](
       _ <-
         if (!isDone) {
          val msg =
-            s"Galaxy installation has failed or timed out for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}"
+            s"Galaxy VM installation has failed or timed out for app ${app.appName.value} in project ${googleProject.value}"
           logger.error(ctx.loggingCtx)(msg) >>
-            F.raiseError[Unit](AppCreationException(msg))
+            F.raiseError[Unit](AppCreationException(msg, traceId = Some(ctx.traceId)))
         } else F.unit
     } yield ()
@@ -1833,14 +2020,14 @@ final case class GKEInterpreterConfig(leoUrlBase: URL,
                                       vpcNetworkTag: NetworkTag,
                                       terraAppSetupChartConfig: TerraAppSetupChartConfig,
                                       ingressConfig: KubernetesIngressConfig,
-                                      galaxyAppConfig: GalaxyAppConfig,
                                       cromwellAppConfig: CromwellAppConfig,
                                       customAppConfig: CustomAppConfig,
                                       allowedAppConfig: AllowedAppConfig,
                                       monitorConfig: AppMonitorConfig,
                                       clusterConfig: KubernetesClusterConfig,
                                       proxyConfig: ProxyConfig,
-                                      galaxyDiskConfig: GalaxyDiskConfig
+                                      galaxyDiskConfig: GalaxyDiskConfig,
+                                      galaxyVmConfig: GalaxyVmConfig
 )

 final case class TerraAppSetupChartConfig(
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
index 8ee7c2c2cb..647f3cc77f 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
@@ -3,7 +3,7 @@ package org.broadinstitute.dsde.workbench.leonardo.dao
 import cats.effect.IO
 import
org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceName import org.broadinstitute.dsde.workbench.leonardo.AppName -import org.broadinstitute.dsde.workbench.model.TraceId +import org.broadinstitute.dsde.workbench.model.{IP, TraceId} import org.broadinstitute.dsde.workbench.model.google.GoogleProject class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] { @@ -13,5 +13,8 @@ class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] { traceId: TraceId ): IO[Boolean] = IO.pure(isUp) + + override def isVmReachable(ip: IP, port: Int, traceId: TraceId): IO[Boolean] = + IO.pure(isUp) } object MockAppDAO extends MockAppDAO(isUp = true) diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala index d932498dc3..3da67b1856 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala @@ -10,7 +10,7 @@ import cats.mtl.Ask import cats.syntax.all._ import com.github.benmanes.caffeine.cache.Caffeine import com.google.api.gax.longrunning.OperationFuture -import com.google.cloud.compute.v1.{Disk, Operation} +import com.google.cloud.compute.v1.{AccessConfig, Disk, Instance, NetworkInterface, Operation} import com.google.protobuf.Timestamp import fs2.Stream import org.broadinstitute.dsde.workbench.google.GoogleStorageDAO @@ -21,13 +21,16 @@ import org.broadinstitute.dsde.workbench.google2.mock.{MockKubernetesService => import org.broadinstitute.dsde.workbench.google2.{ DiskName, GKEModels, + GoogleComputeService, GoogleDiskService, GoogleStorageService, KubernetesModels, MachineTypeName, - RegionName, + NetworkName, + SubnetworkName, ZoneName } +import org.broadinstitute.dsde.workbench.util2.InstanceName import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.AsyncTaskProcessor.Task import org.broadinstitute.dsde.workbench.leonardo.CommonTestData._ @@ -97,6 +100,25 @@ class LeoPubsubMessageSubscriberSpec ): Future[Unit] = Future.successful(()) } val iamDAO = new MockGoogleIamDAO + + // Returns a GCE instance with internal IP "10.0.0.1" and external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests. 
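+  // "1.2.3.4" deliberately matches the external IP the asyncFields assertions below expect;
+  // "10.0.0.1" is only logged by the interpreter, so no test assertion depends on it.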
+ val galaxyComputeService: GoogleComputeService[IO] = new FakeGoogleComputeService { + override def getInstance(project: GoogleProject, zone: ZoneName, instanceName: InstanceName)(implicit + ev: Ask[IO, TraceId] + ): IO[Option[Instance]] = { + val inst = Instance + .newBuilder() + .addNetworkInterfaces( + NetworkInterface + .newBuilder() + .setNetworkIP("10.0.0.1") + .addAccessConfigs(AccessConfig.newBuilder().setNatIP("1.2.3.4").build()) + .build() + ) + .build() + IO.pure(Some(inst)) + } + } val resourceService = new FakeGoogleResourceService { override def getProjectNumber(project: GoogleProject)(implicit ev: Ask[IO, TraceId]): IO[Option[Long]] = IO(Some(1L)) @@ -903,32 +925,29 @@ class LeoPubsubMessageSubscriberSpec getDisk = getDiskOpt.get appRestore <- persistentDiskQuery.getAppDiskRestore(savedApp1.appResources.disk.get.id).transaction galaxyRestore = appRestore.map(_.asInstanceOf[GalaxyRestore]) - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getCluster.status shouldBe KubernetesClusterStatus.Running getCluster.nodepools.size shouldBe 2 - getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified getApp.app.errors shouldBe List.empty getApp.app.status shouldBe AppStatus.Running getApp.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) getApp.cluster.status shouldBe KubernetesClusterStatus.Running - getApp.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp.nodepool.status shouldBe NodepoolStatus.Unspecified + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getDisk.status shouldBe DiskStatus.Ready - galaxyRestore shouldBe Some( - GalaxyRestore(PvcId(s"nfs-pvc-id1"), getApp.app.id) - ) + // Galaxy VM path does not use PVCs — no GalaxyRestore is recorded + galaxyRestore shouldBe None } implicit val gkeAlg: GKEAlgebra[IO] = makeGKEInterp(nodepoolLock, List(savedApp1.release)) @@ -1062,25 +1081,22 @@ class LeoPubsubMessageSubscriberSpec .transaction getApp1 = getAppOpt1.get getApp2 = getAppOpt2.get - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getApp1.cluster.status shouldBe KubernetesClusterStatus.Running getApp2.cluster.status shouldBe KubernetesClusterStatus.Running - getApp1.nodepool.status shouldBe NodepoolStatus.Running - getApp2.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp1.nodepool.status shouldBe NodepoolStatus.Unspecified + getApp2.nodepool.status shouldBe NodepoolStatus.Unspecified getApp1.app.errors shouldBe List() getApp1.app.status shouldBe AppStatus.Running getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some( 
ServiceAccountName("gxy-ksa") ) + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp1.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getApp2.app.errors shouldBe List() @@ -1298,7 +1314,9 @@ class LeoPubsubMessageSubscriberSpec it should "handle an error in delete app" in isolatedDbTest { val savedCluster1 = makeKubeCluster(1).save() val savedNodepool1 = makeNodepool(1, savedCluster1.id).save() - val savedApp1 = makeApp(1, savedNodepool1.id).save() + // Use Cromwell (GKE/Helm path) so the deleteNamespace error triggers AppStatus.Error. + // Galaxy now uses the VM path which swallows deleteInstance errors gracefully. + val savedApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell).save() val mockAckConsumer = mock[AckHandler] val assertions = for { @@ -1432,7 +1450,7 @@ class LeoPubsubMessageSubscriberSpec val savedNodepool1 = makeNodepool(1, savedCluster1.id).save() val disk = makePersistentDisk(None).save().unsafeRunSync()(cats.effect.unsafe.IORuntime.global) - val makeApp1 = makeApp(1, savedNodepool1.id) + val makeApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell) val savedApp1 = makeApp1 .copy(appResources = makeApp1.appResources.copy( @@ -1523,7 +1541,7 @@ class LeoPubsubMessageSubscriberSpec savedApp1.appName, Some(disk.id), Map.empty, - AppType.Galaxy, + AppType.Cromwell, savedApp1.appResources.namespace, None, Some(tr), @@ -1676,7 +1694,9 @@ class LeoPubsubMessageSubscriberSpec savedApp1.appName, None, Map.empty, - AppType.Galaxy, + // Use Cromwell so the GKE cluster creation path is taken and mockGKEService.createCluster can throw. + // Galaxy skips cluster creation (VM path), so the mock would have no effect. 
+ AppType.Cromwell, savedApp1.appResources.namespace, None, Some(tr), @@ -1821,26 +1841,24 @@ class LeoPubsubMessageSubscriberSpec getApp = getAppOpt.get getDiskOpt <- persistentDiskQuery.getById(savedApp1.appResources.disk.get.id).transaction getDisk = getDiskOpt.get - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getCluster.status shouldBe KubernetesClusterStatus.Running getCluster.nodepools.size shouldBe 2 - getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified getApp.app.errors shouldBe List() getApp.app.status shouldBe AppStatus.Running getApp.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) getApp.cluster.status shouldBe KubernetesClusterStatus.Running - getApp.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp.nodepool.status shouldBe NodepoolStatus.Unspecified + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getDisk.status shouldBe DiskStatus.Ready @@ -1872,7 +1890,8 @@ class LeoPubsubMessageSubscriberSpec false, Some(GcsBucketName("fc-bucket")) ) - asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 10), queue) + // maxConcurrentTasks=1 ensures tasks run sequentially so the idempotency check fires for the 2nd task + asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 1), queue) // send message twice _ <- leoSubscriber.handleCreateAppMessage(msg) _ <- leoSubscriber.handleCreateAppMessage(msg) @@ -2024,7 +2043,7 @@ class LeoPubsubMessageSubscriberSpec MockAppDescriptorDAO, lock, resourceService, - FakeGoogleComputeService + galaxyComputeService ) def makeLeoSubscriber( diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala index 6ea504a907..a408cd4559 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala @@ -4,7 +4,6 @@ package util import org.broadinstitute.dsde.workbench.google2.DiskName import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName} -import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.CommonTestData.{makePersistentDisk, userEmail, userEmail2} import org.broadinstitute.dsde.workbench.leonardo.KubernetesTestData.{makeCustomAppService, makeKubeCluster} import org.broadinstitute.dsde.workbench.leonardo.config.Config @@ -16,177 +15,6 @@ import org.scalatest.flatspec.AnyFlatSpecLike class 
BuildHelmChartValuesSpec extends AnyFlatSpecLike with LeonardoTestSuite { - it should "build Galaxy override values string" in { - val savedCluster1 = makeKubeCluster(1) - val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy)) - val res = buildGalaxyChartOverrideValuesString( - Config.gkeInterpConfig, - AppName("app1"), - Release("app1-galaxy-rls"), - savedCluster1, - NodepoolName("pool1"), - userEmail, - Map("WORKSPACE_NAME" -> "test-workspace", - "WORKSPACE_BUCKET" -> "gs://test-bucket", - "WORKSPACE_NAMESPACE" -> "dsp-leo-test1" - ), - ServiceAccountName("app1-galaxy-ksa"), - NamespaceName("ns"), - savedDisk1, - DiskName("disk1-gxy-postres-disk"), - AppMachineType(23, 7), - None - ) - - res.mkString( - "," - ) shouldBe - """nfs.storageClass.name=nfs-app1-galaxy-rls,""" + - """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" + - """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" + - """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" + - """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.tls[0].secretName=tls-secret,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" + - """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" + - """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" + - """galaxy.terra.launch.workspace=test-workspace,""" + - """galaxy.terra.launch.namespace=dsp-leo-test1,""" + - """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" + - """galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" + - """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" + - """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" + - """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" + - """galaxy.jobs.maxLimits.memory=23,""" + - """galaxy.jobs.maxLimits.cpu=7,""" + - """galaxy.jobs.maxRequests.memory=1,""" + - """galaxy.jobs.maxRequests.cpu=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" + - """galaxy.serviceAccount.create=false,""" + - """galaxy.serviceAccount.name=app1-galaxy-ksa,""" + - """rbac.serviceAccount=app1-galaxy-ksa,persistence.nfs.name=ns-nfs-disk,""" + - """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,""" + - """persistence.nfs.size=250Gi,""" + - """persistence.postgres.name=ns-postgres-disk,""" + - 
"""galaxy.postgresql.galaxyDatabasePassword=replace-me,""" + - """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres-disk,""" + - """persistence.postgres.size=10Gi,""" + - """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" + - """nfs.persistence.size=250Gi,""" + - """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" + - """galaxy.persistence.size=200Gi,""" + - """configs.WORKSPACE_NAME=test-workspace,""" + - """extraEnv[0].name=WORKSPACE_NAME,extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" + - """configs.WORKSPACE_BUCKET=gs://test-bucket,""" + - """extraEnv[1].name=WORKSPACE_BUCKET,""" + - """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" + - """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" + - """extraEnv[2].name=WORKSPACE_NAMESPACE,""" + - """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE""" - } - - it should "build Galaxy override values string with restore info" in { - val savedCluster1 = makeKubeCluster(1) - val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy)) - val result = - buildGalaxyChartOverrideValuesString( - Config.gkeInterpConfig, - AppName("app1"), - Release("app1-galaxy-rls"), - savedCluster1, - NodepoolName("pool1"), - userEmail, - Map("WORKSPACE_NAME" -> "test-workspace", - "WORKSPACE_BUCKET" -> "gs://test-bucket", - "WORKSPACE_NAMESPACE" -> "dsp-leo-test1" - ), - ServiceAccountName("app1-galaxy-ksa"), - NamespaceName("ns"), - savedDisk1, - DiskName("disk1-gxy-postres"), - AppMachineType(23, 7), - Some( - GalaxyRestore(PvcId("galaxy-pvc-id"), AppId(123)) - ) - ) - result.mkString( - "," - ) shouldBe - """nfs.storageClass.name=nfs-app1-galaxy-rls,""" + - """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" + - """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" + - """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" + - """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.tls[0].secretName=tls-secret,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" + - """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" + - """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" + - """galaxy.terra.launch.workspace=test-workspace,""" + - """galaxy.terra.launch.namespace=dsp-leo-test1,""" + - """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" + - 
"""galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" + - """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" + - """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" + - """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" + - """galaxy.jobs.maxLimits.memory=23,""" + - """galaxy.jobs.maxLimits.cpu=7,""" + - """galaxy.jobs.maxRequests.memory=1,""" + - """galaxy.jobs.maxRequests.cpu=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" + - """galaxy.serviceAccount.create=false,""" + - """galaxy.serviceAccount.name=app1-galaxy-ksa,""" + - """rbac.serviceAccount=app1-galaxy-ksa,""" + - """persistence.nfs.name=ns-nfs-disk,""" + - """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,persistence.nfs.size=250Gi,""" + - """persistence.postgres.name=ns-postgres-disk,""" + - """galaxy.postgresql.galaxyDatabasePassword=replace-me,""" + - """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres,""" + - """persistence.postgres.size=10Gi,""" + - """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" + - """nfs.persistence.size=250Gi,""" + - """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" + - """galaxy.persistence.size=200Gi,""" + - """configs.WORKSPACE_NAME=test-workspace,""" + - """extraEnv[0].name=WORKSPACE_NAME,""" + - """extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" + - """configs.WORKSPACE_BUCKET=gs://test-bucket,""" + - """extraEnv[1].name=WORKSPACE_BUCKET,""" + - """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" + - """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" + - """extraEnv[2].name=WORKSPACE_NAMESPACE,""" + - """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE,""" + - """restore.persistence.nfs.galaxy.pvcID=galaxy-pvc-id,""" + - """galaxy.persistence.existingClaim=app1-galaxy-rls-galaxy-galaxy-pvc""".stripMargin - } - it should "build Cromwell override values string" in { val savedCluster1 = makeKubeCluster(1) val savedDisk1 = makePersistentDisk(Some(DiskName("disk1"))) diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala index c6ad02ccc5..5b6a8b2190 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala @@ -87,7 +87,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { SetUpProjectFirewallsParams(project, vpcConfig.networkName, RegionName("us-central1"), Map.empty) ) .unsafeRunSync() - computeService.firewallMap.size shouldBe 4 + computeService.firewallMap.size shouldBe 5 vpcConfig.firewallsToAdd.foreach { fwConfig => val fw = 
computeService.firewallMap.get(FirewallRuleName(s"${fwConfig.namePrefix}-us-central1")) fw shouldBe defined @@ -155,6 +155,12 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { ) val test = new VPCInterpreter(Config.vpcInterpreterConfig, stubResourceService(Map.empty), computeService) + val expectedHttpFirewallRules = FirewallRuleConfig( + "leonardo-allow-http", + None, + allSupportedRegions.map(r => r -> List(IpRange("0.0.0.0/0"))).toMap, + List(Allowed("tcp", Some("80"))) + ) test .firewallRulesToAdd( Map( @@ -162,7 +168,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { "leonardo-allow-https-firewall-name" -> "leonardo-ssl" ) ) - .toSet shouldBe Set(expectedSshFirewallRules, expectedIapFirewallRules) + .toSet shouldBe Set(expectedHttpFirewallRules, expectedSshFirewallRules, expectedIapFirewallRules) } private def stubResourceService(labels: Map[String, String]): FakeGoogleResourceService =