Skip to content

Commit af8ec91

Browse files
authored
feat: Add support for email based alerting (#591)
1 parent 55ca440 commit af8ec91

File tree

3 files changed

+116
-4
lines changed

3 files changed

+116
-4
lines changed

modules/backup/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,25 @@ fetch workflows.googleapis.com/Workflow
5353

5454
| Name | Description | Type | Default | Required |
5555
|------|-------------|------|---------|:--------:|
56+
| backup\_monitoring\_frequency | Timeframe in which there should be at least one successful backup | `string` | `"1d"` | no |
5657
| backup\_retention\_time | The number of days backups should be kept | `number` | `30` | no |
5758
| backup\_runs\_list\_max\_results | The max number of backups to list when fetching internal backup runs for the instance. This number must be larger than the number of backups you wish to keep. E.g. for a daily backup schedule and a backup\_retention\_time of 30 days, you'd need to set this to at least 31 for old backups to get deleted. | `number` | `31` | no |
5859
| backup\_schedule | The cron schedule to execute the internal backup | `string` | `"45 2 * * *"` | no |
5960
| compress\_export | Whether or not to compress the export when storing in the bucket; Only valid for MySQL and PostgreSQL | `bool` | `true` | no |
6061
| connector\_params\_timeout | The end-to-end duration the connector call is allowed to run for before throwing a timeout exception. The default value is 1800 and this should be the maximum for connector methods that are not long-running operations. Otherwise, for long-running operations, the maximum timeout for a connector call is 31536000 seconds (one year). | `number` | `1800` | no |
62+
| create\_email\_notification\_channel | Create email notification channel to send alerts | `bool` | `false` | no |
63+
| email\_notification\_channel\_name | Name of email notification channel | `string` | `"Email Notification"` | no |
64+
| enable\_backup\_monitoring | Whether to monitor backup workflows or not | `bool` | `false` | no |
6165
| enable\_connector\_params | Whether to enable connector-specific parameters for Google Workflow SQL Export. | `bool` | `false` | no |
6266
| enable\_export\_backup | Whether to create exports to GCS Buckets with this module | `bool` | `true` | no |
67+
| enable\_export\_monitoring | Whether to monitor export workflows or not | `bool` | `false` | no |
6368
| enable\_internal\_backup | Whether to create internal backups with this module | `bool` | `true` | no |
6469
| export\_databases | The list of databases that should be exported - if is an empty set all databases will be exported | `set(string)` | `[]` | no |
70+
| export\_monitoring\_frequency | Timeframe in which there should be at least one successful export | `string` | `"1d"` | no |
6571
| export\_schedule | The cron schedule to execute the export to GCS | `string` | `"15 3 * * *"` | no |
6672
| export\_uri | The bucket and path uri for exporting to GCS | `string` | n/a | yes |
6773
| log\_db\_name\_to\_export | Whether or not to log database name in the export workflow | `bool` | `false` | no |
74+
| monitoring\_email | Email address to send alerts | `string` | `null` | no |
6875
| project\_id | The project ID | `string` | n/a | yes |
6976
| region | The region where to run the workflow | `string` | `"us-central1"` | no |
7077
| scheduler\_timezone | The Timezone in which the Scheduler Jobs are triggered | `string` | `"Etc/GMT"` | no |

modules/backup/main.tf

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
locals {
  # A dedicated service account is created only when the caller did not pass one
  # (null or empty string both count as "not provided").
  create_service_account = var.service_account == null || var.service_account == ""

  # Identity used by the workflows/scheduler: the module-created account when
  # applicable, otherwise the caller-supplied one.
  service_account = local.create_service_account ? google_service_account.sql_backup_serviceaccount[0].email : var.service_account

  # Shared resource names, also referenced by the monitoring alert queries.
  backup_name = "sql-backup-${var.sql_instance}${var.unique_suffix}"

  # The export targets the replica instance when requested, the primary otherwise.
  export_name = "sql-export-${var.use_sql_instance_replica_in_exporter ? var.sql_instance_replica : var.sql_instance}${var.unique_suffix}"
}
2224

2325

@@ -52,14 +54,23 @@ data "google_sql_database_instance" "backup_instance" {
5254
project = var.project_id
5355
}
5456

57+
# Optional email notification channel used by the backup/export alert policies.
# Created only when var.create_email_notification_channel is true; in that case
# var.monitoring_email must be set to a valid address (its default is null).
resource "google_monitoring_notification_channel" "email" {
  count = var.create_email_notification_channel ? 1 : 0

  # Pin the channel to the module's project, consistent with the other
  # resources here, instead of relying on the provider's default project.
  project      = var.project_id
  display_name = var.email_notification_channel_name
  type         = "email"

  labels = {
    email_address = var.monitoring_email
  }
}
65+
5566
################################
5667
# #
5768
# Internal Backups #
5869
# #
5970
################################
6071
resource "google_workflows_workflow" "sql_backup" {
6172
count = var.enable_internal_backup ? 1 : 0
62-
name = "sql-backup-${var.sql_instance}${var.unique_suffix}"
73+
name = local.backup_name
6374
region = var.region
6475
description = "Workflow for backing up the CloudSQL Instance "
6576
project = var.project_id
@@ -74,7 +85,7 @@ resource "google_workflows_workflow" "sql_backup" {
7485

7586
resource "google_cloud_scheduler_job" "sql_backup" {
7687
count = var.enable_internal_backup ? 1 : 0
77-
name = "sql-backup-${var.sql_instance}${var.unique_suffix}"
88+
name = local.backup_name
7889
project = var.project_id
7990
region = var.region
8091
description = "Managed by Terraform - Triggers a SQL Backup via Workflows"
@@ -91,14 +102,40 @@ resource "google_cloud_scheduler_job" "sql_backup" {
91102
}
92103
}
93104

105+
# Alert when the backup workflow has not had at least one successful execution
# within var.backup_monitoring_frequency.
# Created only when both internal backups and backup monitoring are enabled.
resource "google_monitoring_alert_policy" "sql_backup_workflow_success_alert" {
  count = var.enable_internal_backup && var.enable_backup_monitoring ? 1 : 0

  # Keep the policy in the same project as the workflow it watches.
  project      = var.project_id
  display_name = "Failed workflow: ${local.backup_name}"
  combiner     = "OR"

  conditions {
    display_name = "Failed workflow: ${local.backup_name}"
    condition_monitoring_query_language {
      # Sum the SUCCEEDED executions of this workflow per monitoring window and
      # fire when that sum drops below 1.
      query = <<-EOT
        fetch workflows.googleapis.com/Workflow
        | filter workflow_id == '${local.backup_name}'
        | metric 'workflows.googleapis.com/finished_execution_count'
        | filter metric.status == 'SUCCEEDED'
        | group_by ${var.backup_monitoring_frequency}, [value_finished_execution_count_sum: sum(value.finished_execution_count)]
        | every ${var.backup_monitoring_frequency}
        | condition val() < 1 '1'
      EOT
      duration = "3600s"
      trigger { count = 1 }
      # Missing data is evaluated as a violation.
      evaluation_missing_data = "EVALUATION_MISSING_DATA_ACTIVE"
    }
  }

  # Splat instead of [0]: the email channel has count = 0 when
  # var.create_email_notification_channel is false, and indexing [0] would then
  # fail. The splat yields an empty list in that case.
  notification_channels = google_monitoring_notification_channel.email[*].id
}
130+
94131
################################
95132
# #
96133
# External Backups #
97134
# #
98135
################################
99136
resource "google_workflows_workflow" "sql_export" {
100137
count = var.enable_export_backup ? 1 : 0
101-
name = var.use_sql_instance_replica_in_exporter ? "sql-export-${var.sql_instance_replica}${var.unique_suffix}" : "sql-export-${var.sql_instance}${var.unique_suffix}"
138+
name = local.export_name
102139
region = var.region
103140
description = "Workflow for backing up the CloudSQL Instance"
104141
project = var.project_id
@@ -120,7 +157,7 @@ resource "google_workflows_workflow" "sql_export" {
120157

121158
resource "google_cloud_scheduler_job" "sql_export" {
122159
count = var.enable_export_backup ? 1 : 0
123-
name = var.use_sql_instance_replica_in_exporter ? "sql-export-${var.sql_instance_replica}${var.unique_suffix}" : "sql-export-${var.sql_instance}${var.unique_suffix}"
160+
name = local.export_name
124161
project = var.project_id
125162
region = var.region
126163
description = "Managed by Terraform - Triggers a SQL Export via Workflows"
@@ -143,3 +180,29 @@ resource "google_storage_bucket_iam_member" "sql_instance_account" {
143180
member = "serviceAccount:${data.google_sql_database_instance.backup_instance.service_account_email_address}"
144181
role = "roles/storage.objectCreator"
145182
}
183+
184+
# Alert when the export workflow has not had at least one successful execution
# within var.export_monitoring_frequency.
# Created only when both export backups and export monitoring are enabled.
resource "google_monitoring_alert_policy" "sql_export_workflow_success_alert" {
  count = var.enable_export_backup && var.enable_export_monitoring ? 1 : 0

  # Keep the policy in the same project as the workflow it watches.
  project      = var.project_id
  display_name = "Failed workflow: ${local.export_name}"
  combiner     = "OR"

  conditions {
    display_name = "Failed workflow: ${local.export_name}"
    condition_monitoring_query_language {
      # Sum the SUCCEEDED executions of this workflow per monitoring window and
      # fire when that sum drops below 1.
      query = <<-EOT
        fetch workflows.googleapis.com/Workflow
        | filter workflow_id == '${local.export_name}'
        | metric 'workflows.googleapis.com/finished_execution_count'
        | filter metric.status == 'SUCCEEDED'
        | group_by ${var.export_monitoring_frequency}, [value_finished_execution_count_sum: sum(value.finished_execution_count)]
        | every ${var.export_monitoring_frequency}
        | condition val() < 1 '1'
      EOT
      duration = "3600s"
      trigger { count = 1 }
      # Missing data is evaluated as a violation.
      evaluation_missing_data = "EVALUATION_MISSING_DATA_ACTIVE"
    }
  }

  # Splat instead of [0]: the email channel has count = 0 when
  # var.create_email_notification_channel is false, and indexing [0] would then
  # fail. The splat yields an empty list in that case.
  notification_channels = google_monitoring_notification_channel.email[*].id
}

modules/backup/variables.tf

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,45 @@ variable "use_serverless_export" {
144144
type = bool
145145
default = false
146146
}
147+
148+
# Recipient address for alert emails. Used as the `email_address` label of the
# email notification channel, so it must be set when
# create_email_notification_channel is true.
variable "monitoring_email" {
  type        = string
  default     = null
  description = "Email address to send alerts"
}
153+
154+
# Toggles the alert policy for the internal backup workflow. The policy is only
# created when enable_internal_backup is true as well.
variable "enable_backup_monitoring" {
  type        = bool
  default     = false
  description = "Whether to monitor backup workflows or not"
}
159+
160+
# Monitoring window for the backup alert. The value is interpolated verbatim
# into the `group_by` and `every` clauses of the alert's MQL query.
variable "backup_monitoring_frequency" {
  description = "Timeframe in which there should be at least one successful backup"
  type        = string
  default     = "1d"
}
165+
166+
# Toggles the alert policy for the export workflow. The policy is only created
# when enable_export_backup is true as well.
variable "enable_export_monitoring" {
  type        = bool
  default     = false
  description = "Whether to monitor export workflows or not"
}
171+
172+
# Monitoring window for the export alert. The value is interpolated verbatim
# into the `group_by` and `every` clauses of the alert's MQL query.
variable "export_monitoring_frequency" {
  description = "Timeframe in which there should be at least one successful export"
  type        = string
  default     = "1d"
}
177+
178+
# Controls whether the module creates the email notification channel
# (count = 1 when true, 0 otherwise).
variable "create_email_notification_channel" {
  type        = bool
  default     = false
  description = "Create email notification channel to send alerts"
}
183+
184+
# Display name given to the email notification channel.
variable "email_notification_channel_name" {
  type        = string
  default     = "Email Notification"
  description = "Name of email notification channel"
}

0 commit comments

Comments
 (0)