diff --git a/all-apps/instance-control/hooks/hooks.json.tmpl b/all-apps/instance-control/hooks/hooks.json.tmpl index b882617..5601da6 100644 --- a/all-apps/instance-control/hooks/hooks.json.tmpl +++ b/all-apps/instance-control/hooks/hooks.json.tmpl @@ -20,6 +20,27 @@ }, "execute-command": "/etc/webhook/queue-restic-snapshot.sh" }, + { + "id": "queue-restic-snapshot-no-restart", + "pass-environment-to-command": [ + {"source": "payload", "name": "version"}, + {"source": "payload", "name": "path"}, + {"source": "payload", "name": "tag"}, + {"source": "payload", "name": "request_id"} + ], + "trigger-rule": + { + "match": { + "type": "payload-hmac-sha256", + "secret": "$INSTANCE_CONTROL_WEBHOOKS_SECRET", + "parameter": { + "source": "header", + "name": "X-Nassella-Signature" + } + } + }, + "execute-command": "/etc/webhook/queue-restic-snapshot-no-restart.sh" + }, { "id": "restic-snapshot-status", "include-command-output-in-response": true, diff --git a/all-apps/instance-control/hooks/queue-restic-snapshot-no-restart.sh b/all-apps/instance-control/hooks/queue-restic-snapshot-no-restart.sh new file mode 100755 index 0000000..9391119 --- /dev/null +++ b/all-apps/instance-control/hooks/queue-restic-snapshot-no-restart.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +# TODO the systemd unit should actually do this +# touch /maintenance/maintenance.on +# rm /maintenance/maintenance.on + +# for instance-control docker compose setup: +# make a directory in /tmp for these pipes and mount that as a volume +# into the container + +# TODO read 'version' arg from request and make sure it +# matches the version of this script + +# use a named pipe +printf "%s\t%s\t%s\t%s\n" "$HOOK_tag" "$HOOK_request_id" "$HOOK_path" "false" > /tmp/restic/snapshot_trigger_pipe diff --git a/all-apps/instance-control/hooks/queue-restic-snapshot.sh b/all-apps/instance-control/hooks/queue-restic-snapshot.sh index be25037..9409ecc 100755 --- a/all-apps/instance-control/hooks/queue-restic-snapshot.sh +++ 
b/all-apps/instance-control/hooks/queue-restic-snapshot.sh @@ -12,4 +12,4 @@ # matches the version of this script # use a named pipe -printf "%s\t%s\t%s\n" "$HOOK_tag" "$HOOK_request_id" "$HOOK_path" > /tmp/restic/snapshot_trigger_pipe +printf "%s\t%s\t%s\t%s\n" "$HOOK_tag" "$HOOK_request_id" "$HOOK_path" "true" > /tmp/restic/snapshot_trigger_pipe diff --git a/config/production.tfvars.tmpl b/config/production.tfvars.tmpl index 128469f..1515675 100644 --- a/config/production.tfvars.tmpl +++ b/config/production.tfvars.tmpl @@ -2,6 +2,8 @@ server_type = "s-2vcpu-2gb" # the digital ocean server type to deploy do_token = "" # token from "API" settings on DigitalOcean +digitalocean_volume_size = 60 # size in GB of the app storage volume + cloudflare_api_token = "" # corresponding API token should allow modifying DNS settings for the Nassella configured domain cloudflare_zone_id = "" # corresponding zone ID for API token for the Nassella configured domain cloudflare_account_id = "" # corresponding account ID for API token diff --git a/main.tf b/main.tf index e8cb296..0b876ff 100644 --- a/main.tf +++ b/main.tf @@ -79,6 +79,11 @@ variable "subdomains" { description = "Subdomains to setup" } +variable "digitalocean_volume_size" { + type = number + description = "Size in GB of the app storage digitalocean volume" +} + provider "digitalocean" { token = var.do_token } @@ -124,7 +129,7 @@ resource "cloudflare_dns_record" "subdomains" { resource "digitalocean_volume" "machine" { region = var.datacenter name = "${var.cluster_name}" - size = 60 + size = var.digitalocean_volume_size initial_filesystem_type = "ext4" initial_filesystem_label = "appstorage" description = "persistent storage for docker apps" @@ -137,7 +142,6 @@ resource "digitalocean_droplet" "machine" { size = var.server_type ssh_keys = [digitalocean_ssh_key.first.fingerprint] user_data = file("ignition.json") - graceful_shutdown = true lifecycle { create_before_destroy = true } diff --git a/make-caddyfile.sh 
b/make-caddyfile.sh index 74ee705..398de17 100755 --- a/make-caddyfile.sh +++ b/make-caddyfile.sh @@ -66,13 +66,15 @@ for config_string in ${APP_CONFIGS[@]}; do fulldomain="$subdomain.$ROOT_DOMAIN" echo "$fulldomain {" - # config for maintenance mode - echo "@maintenanceModeActive file /maintenance/maintenance.on {" - echo " root /" - echo "}" - echo "handle @maintenanceModeActive {" - echo " respond \"We are performing a maintenance, come back later\" 503" - echo "}" + if [ "$app" != "instance-control" ] && [ "$app" != "dozzle" ]; then + # config for maintenance mode + echo "@maintenanceModeActive file /maintenance/maintenance.on {" + echo " root /" + echo "}" + echo "handle @maintenanceModeActive {" + echo " respond \"We are performing a maintenance, come back later\" 503" + echo "}" + fi echo $body echo "}" diff --git a/restic-snapshot.sh b/restic-snapshot.sh index 0af46dd..3b4fc70 100644 --- a/restic-snapshot.sh +++ b/restic-snapshot.sh @@ -25,6 +25,7 @@ while read -u 3 msg; do tag=$1 request_id=$2 path=$3 # TODO not currently used + restart=$4 # if we should restart the databases after the snapshot or stay in maintenance mode # update status for webhooks printf "%s\n" "running" > "/tmp/restic/snapshot_status_$request_id" @@ -60,27 +61,29 @@ while read -u 3 msg; do # perform the snapshot docker run --rm --volume /nassella:/nassella --volume /restic-password:/restic-password -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} -i restic/restic:0.18.0 backup --verbose --repo s3:${BACKBLAZE_BUCKET_URL} --password-file /restic-password --tag "$tag" "/nassella" - # restart databases - if [ $ghost_db_running = true ]; then - docker start app-ghost_db-1 - fi - if [ $nassella_lldap_db_running = true ]; then - docker start app-nassella_lldap_db-1 - fi - if [ $nassella_authelia_db_running = true ]; then - docker start app-nassella_authelia_db-1 - fi - if [ $nassella_db_running = true ]; then - docker start app-nassella_db-1 - fi - 
if [ $nextcloud_db_running = true ]; then - docker start app-nextcloud_db-1 - fi - if [ $nextcloud_redis_running = true ]; then - docker start app-nextcloud_redis-1 - fi + if [ "$restart" = "true" ]; then # quoted: value comes from the trigger pipe and may be empty + # restart databases + if [ $ghost_db_running = true ]; then + docker start app-ghost_db-1 + fi + if [ $nassella_lldap_db_running = true ]; then + docker start app-nassella_lldap_db-1 + fi + if [ $nassella_authelia_db_running = true ]; then + docker start app-nassella_authelia_db-1 + fi + if [ $nassella_db_running = true ]; then + docker start app-nassella_db-1 + fi + if [ $nextcloud_db_running = true ]; then + docker start app-nextcloud_db-1 + fi + if [ $nextcloud_redis_running = true ]; then + docker start app-nextcloud_redis-1 + fi - rm /app/maintenance/maintenance.on + rm /app/maintenance/maintenance.on + fi # update status for webhooks printf "%s\n" "complete" > "/tmp/restic/snapshot_status_$request_id" diff --git a/src/db-init.sql b/src/db-init.sql index 8c5d150..b897d28 100644 --- a/src/db-init.sql +++ b/src/db-init.sql @@ -26,6 +26,7 @@ create table user_service_configs( digitalocean_api_token_enc varchar(255), digitalocean_region varchar(255), digitalocean_size varchar(255), + digitalocean_volume_size integer, backblaze_application_key_enc varchar(255), backblaze_key_id_enc varchar(255), backblaze_bucket_url_enc varchar(255) @@ -75,6 +76,7 @@ create table deployments( terraform_machine_destroy deployment_status not null default 'queued', terraform_ip_create deployment_status not null default 'queued', terraform_ip_destroy deployment_status not null default 'queued', + instance_backup deployment_status not null default 'queued', log_enc text ); diff --git a/src/db.scm b/src/db.scm index 3dc35ee..9cdafbd 100644 --- a/src/db.scm +++ b/src/db.scm @@ -230,6 +230,7 @@ returning users.user_id;" (digitalocean-api-token . ("digitalocean_api_token_enc" #t)) (digitalocean-region . ("digitalocean_region" #f)) (digitalocean-size . 
("digitalocean_size" #f)) + (digitalocean-volume-size . ("digitalocean_volume_size" #f)) (backblaze-application-key . ("backblaze_application_key_enc" #t)) (backblaze-key-id . ("backblaze_key_id_enc" #t)) (backblaze-bucket-url . ("backblaze_bucket_url_enc" #t)))) @@ -416,6 +417,11 @@ returning users.user_id;" (custom-image . "terraform_custom_image") (machine-create . "terraform_machine_create") (machine-destroy . "terraform_machine_destroy") + (ip-create . "terraform_ip_create") + (ip-destroy . "terraform_ip_destroy") + (volume-create . "terraform_volume_create") + (volume-destroy . "terraform_volume_destroy") + (instance-backup . "instance_backup") (status . "status") (id . "id") (instance-id . "instance_id"))) @@ -621,13 +627,15 @@ returning users.user_id;" ;; The "up" file is called to run the migration and the "down" file is called to ;; "undo" the migration. (define *migrations* - '((0 . "adding-instance-control-app"))) + '((0 . "adding-instance-control-app") + (1 . "adding-service-config-digitalocean-volume-size") + (2 . 
"adding-deployments-instance-backup"))) (define (run-pending-migrations conn) (let* ((migration-ids (sort (map car *migrations*) <)) (migration-rows (query conn "select migration_id from migrations;")) (applied-migration-ids (if (> (row-count migration-rows) 0) - (row-values migration-rows) + (column-values migration-rows) '()))) (for-each (lambda (id) diff --git a/src/migrations/0-adding-instance-control-app-down.sql b/src/migrations/0-adding-instance-control-app-down.sql new file mode 100644 index 0000000..abc39c8 --- /dev/null +++ b/src/migrations/0-adding-instance-control-app-down.sql @@ -0,0 +1 @@ +alter table user_selected_apps drop column instance_control_version; diff --git a/src/migrations/0-adding-instance-control-app-up.sql b/src/migrations/0-adding-instance-control-app-up.sql new file mode 100644 index 0000000..bd0fb26 --- /dev/null +++ b/src/migrations/0-adding-instance-control-app-up.sql @@ -0,0 +1 @@ +alter table user_selected_apps add instance_control_version varchar(100); diff --git a/src/migrations/1-adding-service-config-digitalocean-volume-size-down.sql b/src/migrations/1-adding-service-config-digitalocean-volume-size-down.sql new file mode 100644 index 0000000..b167c5f --- /dev/null +++ b/src/migrations/1-adding-service-config-digitalocean-volume-size-down.sql @@ -0,0 +1 @@ +alter table user_service_configs drop column digitalocean_volume_size; diff --git a/src/migrations/1-adding-service-config-digitalocean-volume-size-up.sql b/src/migrations/1-adding-service-config-digitalocean-volume-size-up.sql new file mode 100644 index 0000000..dbe2b98 --- /dev/null +++ b/src/migrations/1-adding-service-config-digitalocean-volume-size-up.sql @@ -0,0 +1 @@ +alter table user_service_configs add digitalocean_volume_size integer; diff --git a/src/migrations/2-adding-deployments-instance-backup-down.sql b/src/migrations/2-adding-deployments-instance-backup-down.sql new file mode 100644 index 0000000..fe9adea --- /dev/null +++ 
b/src/migrations/2-adding-deployments-instance-backup-down.sql @@ -0,0 +1 @@ +alter table deployments drop column instance_backup; diff --git a/src/migrations/2-adding-deployments-instance-backup-up.sql b/src/migrations/2-adding-deployments-instance-backup-up.sql new file mode 100644 index 0000000..efc7db2 --- /dev/null +++ b/src/migrations/2-adding-deployments-instance-backup-up.sql @@ -0,0 +1 @@ +alter table deployments add instance_backup deployment_status not null default 'queued'; diff --git a/src/nassella.scm b/src/nassella.scm index 5909d33..debd7e0 100644 --- a/src/nassella.scm +++ b/src/nassella.scm @@ -723,23 +723,31 @@ h1, h2, h3, h4, h5, h6 { (with-output-to-file (string-append dir "/terraform.tfstate.backup") (lambda () (write-string state-backup)))) (define (parse-deployment-log log) - (define (search complete in-progress) - (cond ((irregex-search complete log) + (define (search complete in-progress failed) + (cond ((irregex-search failed log) + 'failed) + ((irregex-search complete log) 'complete) ((irregex-search in-progress log) 'in-progress) (else 'queued))) - `((generate-configs . ,(search "terraform apply" "NASSELLA_CONFIG: start")) + `((generate-configs . ,(search "terraform apply" "NASSELLA_CONFIG: start" "Failed to install provider")) ;; TODO this didn't seem to work right when upgrading the flatcar image ;; log: [0mdigitalocean_custom_image.flatcar: Creating... ;; digitalocean_custom_image.flatcar: Still creating... [00m10s elapsed] ;; digitalocean_custom_image.flatcar: Still creating... [00m20s elapsed] ;; digitalocean_custom_image.flatcar: Still creating... [00m30s elapsed] ;; digitalocean_custom_image.flatcar: Still creating... [00m40s elapsed] - (custom-image . ,(search "custom_image.flatcar: Modifications complete" "custom_image.flatcar: Modifying")) - (machine-create . ,(search "droplet.machine: Creation complete" "droplet.machine: Creating...")) + (custom-image . 
,(search '(or "custom_image.flatcar: Modifications complete" "custom_image.flatcar: Creation complete") + '(or "custom_image.flatcar: Modifying" "custom_image.flatcar: Creating") + "XXX - nothing")) + (machine-create . ,(search "droplet.machine: Creation complete" "droplet.machine: Creating..." "XXX - nothing")) (machine-destroy . ,(search "droplet.machine: Destruction complete" - '(: "droplet.machine (deposed object " (* alphanum) "): Destroying..."))))) + '(: "droplet.machine (deposed object " (* alphanum) "): Destroying...") "XXX - nothing")) + (ip-destroy . ,(search "reserved_ip_assignment.machine: Destruction complete" "reserved_ip_assignment.machine: Destroying..." "XXX - nothing")) + (ip-create . ,(search "reserved_ip_assignment.machine: Creation complete" "reserved_ip_assignment.machine: Creating..." "Error Assigning reserved IP")) + (volume-create . ,(search "volume_attachment.machine: Creation complete" "volume_attachment.machine: Creating..." "XXX - nothing")) + (volume-destroy . ,(search "volume_attachment.machine: Destruction complete" "volume_attachment.machine: Destroying..." "XXX - nothing")))) (define (write-config-entry name value) (display name) @@ -1318,7 +1326,9 @@ chmod -R 777 /opt/keys"))) (method POST)) (VStack (Fieldset - (@ (title "Size")) + (@ (title "Instance Properties")) + (Field (@ (name "volume-size") (label ("Volume Size in GB (For persistent application storage)")) + (value ,(alist-ref 'digitalocean-volume-size config eq? "60")))) (Field (@ (element select) (name "size") (input-style ((max-width "100%")))) ,@(map (lambda (s) `(option (@ (value ,(alist-ref 'slug s)) ,@(if (equal? (alist-ref 'slug s) "s-2vcpu-2gb") `((selected "selected")) '())) @@ -1338,7 +1348,8 @@ chmod -R 777 /opt/keys"))) db (session-user-id) instance-id - `((digitalocean-size . ,(alist-ref 'size (current-params))))))) + `((digitalocean-size . ,(alist-ref 'size (current-params))) + (digitalocean-volume-size . 
,(alist-ref 'volume-size (current-params))))))) (redirect (conc "/config/wizard/review/" instance-id)))) (get/widgets @@ -1375,6 +1386,7 @@ chmod -R 777 /opt/keys"))) (li "Size: " ,(alist-ref 'digitalocean-size service-config))) (form (@ (action ,(conc "/config/wizard/review-submit/" instance-id)) (method POST)) + (input (@ (type "hidden") (value ,(alist-ref 'force (current-params))) (name "force"))) (VStack (Form-Nav (@ (back-to ,(conc "/config/wizard/machine2/" instance-id)) (submit-button "Launch"))))))))) @@ -1386,7 +1398,8 @@ chmod -R 777 /opt/keys"))) (with-db/transaction (lambda (db) (get-most-recent-deployment-status db (session-user-id) instance-id))))))) - (when (not (or (eq? status 'queued) (eq? status 'in-progress))) + (when (or (not (or (eq? status 'queued) (eq? status 'in-progress))) + (equal? (alist-ref 'force (current-params)) "true")) (let* ((instance-id (alist-ref "id" (current-params) equal?)) (restic-snapshot-id (alist-ref 'restic-snapshot-id (current-params))) (results @@ -1466,21 +1479,66 @@ chmod -R 777 /opt/keys"))) (write-config-entry (car e) (cdr e))) `(("server_type" . ,(alist-ref 'digitalocean-size service-config)) ("do_token" . ,(alist-ref 'digitalocean-api-token service-config)) + ("digitalocean_volume_size" . ,(alist-ref 'digitalocean-volume-size service-config)) ("cloudflare_api_token" . ,(alist-ref 'cloudflare-api-token service-config)) ("cloudflare_zone_id" . ,(alist-ref 'cloudflare-zone-id service-config)) ("cloudflare_account_id" . ,(alist-ref 'cloudflare-account-id service-config)) ("cluster_name" . "nassella") ("datacenter" . ,(alist-ref 'digitalocean-region service-config)) ;; (source <(curl -sSfL https://stable.release.flatcar-linux.net/amd64-usr/current/version.txt); echo "${FLATCAR_VERSION_ID}") - ("flatcar_stable_version" . "4593.2.1"))) + ("flatcar_stable_version" . 
"4593.2.2"))) ;; remove the newline that generating the ssh key adds (display "ssh_keys=[\"") (display (string-drop-right ssh-pub-key 1)) (print "\"]")))) (let* ((instance-id (alist-ref "id" (current-params) equal?)) (user-id (session-user-id)) + (app-config + (with-db/transaction + (lambda (db) + (get-user-app-config db (session-user-id) instance-id)))) (deployment-id (with-db/transaction (lambda (db) (create-deployment db user-id instance-id)))) - (dir (deployment-directory user-id instance-id))) + (dir (deployment-directory user-id instance-id)) + (backup-request-id (conc (truncate (time->seconds (current-time))) "-" (pseudo-random-integer 10000)))) + (with-db/transaction + (lambda (db) + (update-deployment-progress db deployment-id '((instance-backup . in-progress))))) + (handle-exceptions + exn + (with-db/transaction + (lambda (db) + (update-deployment-progress db deployment-id '((instance-backup . failed))))) + (send-instance-control-command + (alist-ref 'root-domain app-config) + (alist-ref 'subdomain (alist-ref 'instance-control (alist-ref 'config app-config))) + "queue-restic-snapshot-no-restart" + (alist-ref 'webhooks-secret (alist-ref 'instance-control (alist-ref 'config app-config))) + `((path . "/") + (tag . "automated_pre_instance_update") + ;; effectively a guid, we just want something unique + (request_id . ,backup-request-id) + (version . 0)))) (thread-start! (lambda () + ;; TODO add a timeout and set status to failed and error if we hit the timeout + (let ((start-time (time->seconds (current-time)))) + (let loop () + (thread-sleep! 1) + (let* ((status-result + (handle-exceptions + exn + '((status . "error")) + (send-instance-control-command + (alist-ref 'root-domain app-config) + (alist-ref 'subdomain (alist-ref 'instance-control (alist-ref 'config app-config))) + "restic-snapshot-status" + (alist-ref 'webhooks-secret (alist-ref 'instance-control (alist-ref 'config app-config))) + `((request_id . ,backup-request-id) + (version . 
0))))) + (complete (equal? (alist-ref 'status status-result) "complete"))) ;; equal? tolerates a response with no status key + (if (or complete (> (- (time->seconds (current-time)) start-time) 45)) + (with-db/transaction + (lambda (db) + (update-deployment-progress db deployment-id `((instance-backup . ,(or (and complete 'complete) 'failed)))))) + (loop))))) (change-directory dir) (let ((pid (process-run "make apply > make-out 2>&1"))) (with-db/transaction (lambda (db) (update-deployment-in-progress db deployment-id pid))) @@ -1512,15 +1570,9 @@ chmod -R 777 /opt/keys"))) (with-db/transaction (lambda (db) (update-deployment-progress db deployment-id progress) - ;; TODO THIS DOESN'T WORK RIGHT FOR TERRAFORM OP FAILURES - ;; like the random digital ocean error saying the IP can't be - ;; updated because another operation is in progress. - ;; it still registers as "success". - ;; probably need to also write stderr to a file and read/store/parse that? - ;; Should we parse make-out for string "Apply complete!" ? (update-deployment-status db user-id deployment-id - (if exit-normal 'complete 'failed) + (if (= status 0) 'complete 'failed) (with-input-from-file (string-append dir "/make-out") read-string)) (update-user-terraform-state db user-id instance-id (if (eof-object? tf-state) "" tf-state) @@ -1542,7 +1594,7 @@ chmod -R 777 /opt/keys"))) (lambda (db) `((status . ,(get-most-recent-deployment-status db (session-user-id) instance-id)) (progress . 
,(get-most-recent-deployment-progress db (session-user-id) instance-id)))))) - (output (with-input-from-file (string-append (deployment-directory (session-user-id) instance-id) "/make-out") read-string)) + (output (handle-exceptions exn "" (with-input-from-file (string-append (deployment-directory (session-user-id) instance-id) "/make-out") read-string))) (progress (alist-ref 'progress res)) (status (alist-ref 'status res))) `(App @@ -1554,9 +1606,14 @@ chmod -R 777 /opt/keys"))) ((in-progress) "Deployment in progress") ((complete) "Deployment complete!") ((failed) "Deployment failed"))) - (ul (li "generate configs: " ,(progress-status->text (alist-ref 'generate-configs progress))) + (ul (li "perform backup: " ,(progress-status->text (alist-ref 'instance-backup progress))) + (li "generate configs: " ,(progress-status->text (alist-ref 'generate-configs progress))) (li "custom flatcar image: " ,(progress-status->text (alist-ref 'custom-image progress))) - (li "machine create: " ,(progress-status->text (alist-ref 'machine-create progress))) + (li "application volume disconnect: " ,(progress-status->text (alist-ref 'volume-destroy progress))) + (li "instance create: " ,(progress-status->text (alist-ref 'machine-create progress))) + (li "instance mapped ip disconnect: " ,(progress-status->text (alist-ref 'ip-destroy progress))) + (li "instance mapped ip connect: " ,(progress-status->text (alist-ref 'ip-create progress))) + (li "application volume connect: " ,(progress-status->text (alist-ref 'volume-create progress))) (li "cleanup previous machine: " ,(progress-status->text (alist-ref 'machine-destroy progress)))) (div (a (@ (href "/dashboard")) "Dashboard") @@ -1733,7 +1790,7 @@ chmod -R 777 /opt/keys"))) ("cluster_name" . "nassella") ("datacenter" . ,(alist-ref 'digitalocean-region service-config)) ;; (source <(curl -sSfL https://stable.release.flatcar-linux.net/amd64-usr/current/version.txt); echo "${FLATCAR_VERSION_ID}") - ("flatcar_stable_version" . 
"4593.2.1"))) + ("flatcar_stable_version" . "4593.2.2"))) ;; remove the newline that generating the ssh key adds (display "ssh_keys=[\"") (display (string-drop-right ssh-pub-key 1)) (print "\"]"))) ;; TODO need a new table to track destroying? @@ -1825,6 +1882,7 @@ chmod -R 777 /opt/keys"))) `((ul (li "generate configs: " ,(progress-status->text (alist-ref 'generate-configs progress))) (li "custom flatcar image: " ,(progress-status->text (alist-ref 'custom-image progress))) (li "machine create: " ,(progress-status->text (alist-ref 'machine-create progress))) + (li "attaching : " ,(progress-status->text (alist-ref 'machine-create progress))) (li "cleanup previous machine: " ,(progress-status->text (alist-ref 'machine-destroy progress)))) (div (a (@ (href "/dashboard")) "Dashboard")