From c51e72686e7c66bbbbdd42dae5e746a7dccdcdd5 Mon Sep 17 00:00:00 2001
From: Peter Leurs <peter@pfoe.be>
Date: Fri, 15 Aug 2025 23:09:26 +0200
Subject: [PATCH 1/2] Rewrite logging logic

Log everything to file, split across two files: a regular one and a
-debug variant.  The -debug variant can contain sensitive data, but the
regular file cannot.  This way we can use the non-debug variant for the
monitoring later on.

Note that the idea is that those files are empty if the last run was
successful, and contain the error when something went wrong.
---
 backupmanager.j2 | 51 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 18 deletions(-)

diff --git a/backupmanager.j2 b/backupmanager.j2
index b5f6fdf..063e48b 100644
--- a/backupmanager.j2
+++ b/backupmanager.j2
@@ -43,6 +43,23 @@ except BlockingIOError:
     print("Another instance is running.")
     sys.exit(1)
 
+# These files record all problems with the last run: empty when we ran successfully,
+# otherwise what needs attention (the -debug variant may also hold sensitive details)...
+logfd = open("{{ homedir }}/.backupmanager.errors", "w")
+logfd2 = open("{{ homedir }}/.backupmanager.errors-debug", "w")
+
+def output_warning(msg, sensitive=""):
+    print(f"WARN: {msg} {sensitive}")
+    logfd.write(f"WARN: {msg}\n")
+    logfd2.write(f"WARN: {msg} {sensitive}\n")
+
+def output_fatal(msg, sensitive="", dontquityet=False):
+    print(f"FATAL: {msg} {sensitive}")
+    logfd.write(f"FATAL: {msg}\n")
+    logfd2.write(f"FATAL: {msg} {sensitive}\n")
+    if not dontquityet:
+        sys.exit(1)
+
 def run_command(command, env=None):
     current_env = os.environ.copy()
     if env is not None:
@@ -53,7 +70,7 @@ def run_command(command, env=None):
     sys.stdout.flush()
     sys.stderr.flush()
     if process.returncode != 0:
-        raise RuntimeError(f"Command '{command}' failed with return code {process.returncode}, environment is {env}")
+        output_fatal(f"Command '{command}' failed with return code {process.returncode}", sensitive=f"environment is {env}")
 
 def get_command(command, env=None, full_return=False):
     current_env = os.environ.copy()
@@ -71,7 +88,7 @@ def get_command(command, env=None, full_return=False):
     if error:
         print(error.decode().strip(), file=sys.stderr)
     if return_code != 0:
-        raise RuntimeError(f"Command '{command}' failed with return code {return_code}, environment is {env}")
+        output_fatal(f"Command '{command}' failed with return code {return_code}", sensitive=f"environment is {env}")
     return output.decode()
 
 def get_snapshot_info(repo):
@@ -128,23 +145,22 @@ for r in repos:
         repos_ok.append({"repo":r,"config":repoconfig})
 
 if len(repos_ok)==0:
-    print("None of the repositories can be accessed.  At least one must be reachable for me to output repository init commands")
-    sys.exit(1)
+    output_fatal("None of the repositories can be accessed.  At least one must be reachable for me to output repository init commands")
 
 if polynomial_ok==False:
-    print("Not all repositories have the same chunker polynomial configured.  This can ONLY be configured when the repository is first created.  Please delete the repositories you can rebuild...")
+    msg = "Not all repositories have the same chunker polynomial configured.  This can ONLY be configured when the repository is first created.  Please delete the repositories you can rebuild...\n"
+    s = ""
     for r in repos_ok:
-        print(f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}")
-    sys.exit(1)
+        s = s + f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}\n"
+    output_fatal(msg, sensitive=s)
 
 if len(repos_in_error)!=0:
-    print("Could not open all repositories.  Check that they are accessible and that the passwords are correct.  If they are not yet initialized, use the following commands:")
+    msg = "Could not open all repositories.  Check that they are accessible and that the passwords are correct."
+    s = "If they are not yet initialized, use the following commands:\n\n"
     repo_from=repos_ok[0]['repo']
     for r in repos_in_error:
-        print()
-        print(f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}")
-    sys.exit(1)
-
+        s = s + f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}\n"
+    output_fatal(msg, sensitive=s)
 
 for r in repos:
     print(f"Getting snapshot list for repo {r['name']}")
@@ -160,11 +176,10 @@ for r in repos:
         if s['hostname'] not in r['hosts']:
             wronghosts.update(s['hostname'])
     if wronghosts:
-        print(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list")
+        output_fatal(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list", dontquityet=True)
         hostsok = False
 if not hostsok:
-    print("Host information not ok, aborting")
-    sys.exit(1)
+    output_fatal("Host information not ok, aborting")
 
 for host in allhosts:
     print(f"Syncing hostname {host}")
@@ -194,7 +209,7 @@ for host in allhosts:
                 most_recent_backup_on = r
                 most_recent_backup_id = s['id']
     if most_recent_backup_ts == 0:
-        print(f"WARNING: There are no backups for {host}")
+        output_warning(f"There are no backups for {host}")
         continue
 
     # We now know the most recent backup.  See if a backup is present on all targets that carry this hostname.
@@ -228,9 +243,9 @@ for host in allhosts:
         timeout = hosts[host]["timeout"]
     if not idle:
         if not have_a_copy:
-            print(f"WARNING: We do not have a copy for {host}")
+            output_warning(f"We do not have a copy for {host}")
         if most_recent_backup_ts < time.time()-(timeout*24*3600):
-            print(f"WARNING: Last backup for {host} is too old")
+            output_warning(f"Last backup for {host} is too old")
 
 for r in repos:
     e = {}

From 8c603d6ebd1e78829f2e9547dea056d6ab62aafe Mon Sep 17 00:00:00 2001
From: Peter Leurs <peter@pfoe.be>
Date: Fri, 15 Aug 2025 23:09:48 +0200
Subject: [PATCH 2/2] add monitoring scripts

---
 appinfo.yml         | 29 +++++++++++++++++++++++++++++
 monitor-test.sh.j2  | 16 ++++++++++++++++
 monitor.sh.j2       | 18 ++++++++++++++++++
 monitor.socket.j2   |  9 +++++++++
 monitor@.service.j2 | 15 +++++++++++++++
 5 files changed, 87 insertions(+)
 create mode 100644 monitor-test.sh.j2
 create mode 100644 monitor.sh.j2
 create mode 100644 monitor.socket.j2
 create mode 100644 monitor@.service.j2

diff --git a/appinfo.yml b/appinfo.yml
index 6eb491a..d16b1be 100644
--- a/appinfo.yml
+++ b/appinfo.yml
@@ -6,6 +6,16 @@ templatefiles:
   - src: backupmanager.j2
     dest: ~/backupmanager
     mode: "0755"
+  - src: monitor@.service.j2
+    dest: ~/.config/systemd/user/monitor@.service
+  - src: monitor.socket.j2
+    dest: ~/.config/systemd/user/monitor.socket
+  - src: monitor.sh.j2
+    dest: ~/monitor.sh
+    mode: "0755"
+  - src: monitor-test.sh.j2
+    dest: ~/monitor-test.sh
+    mode: "0755"
 backuptype: none
 configdefinition:
   "$id": "backup manager config"
@@ -72,3 +82,22 @@ configdefinition:
                     description: Repositories where the latest backup is older then timeout days will give a warning that there are no recent backups
   required:
     - repos
+exports:
+  monitoring:
+    checks:
+       - name: main
+         message: Backup manager activated
+         interval: 3600
+         type: string
+         okvalue: active
+       - name: lastrun
+         message: Errors last run
+         interval: 3600
+         type: string
+         okvalue: ""
+       - name: lastrunrecent
+         message: Backup manager did not run recently
+         interval: 3600
+         type: string
+         okvalue: "OK"
+
diff --git a/monitor-test.sh.j2 b/monitor-test.sh.j2
new file mode 100644
index 0000000..a1a125e
--- /dev/null
+++ b/monitor-test.sh.j2
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Usage: ./monitor-test.sh <command>
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 <command>"
+  exit 1
+fi
+
+command="$1"
+
+# Connect to socket and send command
+output=$(nc -U {{ statedir }}/{{ name }}.monitoring <<< "$command")
+
+# Print output
+echo "$output"
diff --git a/monitor.sh.j2 b/monitor.sh.j2
new file mode 100644
index 0000000..0e8e932
--- /dev/null
+++ b/monitor.sh.j2
@@ -0,0 +1,18 @@
+#!/bin/bash
+read -r command
+case $command in
+  main)
+    systemctl is-active --user backup-manager.timer
+    ;;
+  lastrun)
+    cat {{ homedir }}/.backupmanager.errors
+    ;;
+  lastrunrecent)
+    [ -f {{ homedir }}/.backupmanager.errors ] && [ $(find {{ homedir }}/.backupmanager.errors -mtime -2) ] && echo "OK" || echo "outdated"
+    ;;
+  *)
+    echo "Parameter unknown"
+    ;;
+esac
+
+
diff --git a/monitor.socket.j2 b/monitor.socket.j2
new file mode 100644
index 0000000..17163d1
--- /dev/null
+++ b/monitor.socket.j2
@@ -0,0 +1,9 @@
+[Unit]
+Description=Execute Command Socket
+
+[Socket]
+ListenStream={{ statedir }}/{{ name }}.monitoring
+Accept=yes
+
+[Install]
+WantedBy=sockets.target
diff --git a/monitor@.service.j2 b/monitor@.service.j2
new file mode 100644
index 0000000..55ee896
--- /dev/null
+++ b/monitor@.service.j2
@@ -0,0 +1,15 @@
+[Unit]
+Description=Execute Command Service
+After=network.target
+Requires=monitor.socket
+
+[Service]
+Type=simple
+ExecStart={{ homedir }}/monitor.sh
+StandardInput=socket
+StandardOutput=socket
+TimeoutStopSec=5
+RuntimeMaxSec=10
+
+[Install]
+WantedBy=sockets.target
\ No newline at end of file