Compare commits

..

2 commits

Author SHA1 Message Date
8c603d6ebd add monitoring scripts 2025-08-15 23:09:48 +02:00
c51e72686e Rewrite logging logic
Now log everything in a file, but split it up into multiple files
The -debug variant can contain sensitive data, but the regular file
cannot.  This way we can use the non-debug variant for the monitoring
later on.

Note that the idea is that those files are empty if the last run was
successful, and contain the error when something went wrong.
2025-08-15 23:09:26 +02:00
6 changed files with 120 additions and 18 deletions

View file

@ -6,6 +6,16 @@ templatefiles:
- src: backupmanager.j2 - src: backupmanager.j2
dest: ~/backupmanager dest: ~/backupmanager
mode: "0755" mode: "0755"
- src: monitor@.service.j2
dest: ~/.config/systemd/user/monitor@.service
- src: monitor.socket.j2
dest: ~/.config/systemd/user/monitor.socket
- src: monitor.sh.j2
dest: ~/monitor.sh
mode: "0755"
- src: monitor-test.sh.j2
dest: ~/monitor-test.sh
mode: "0755"
backuptype: none backuptype: none
configdefinition: configdefinition:
"$id": "backup manager config" "$id": "backup manager config"
@ -72,3 +82,22 @@ configdefinition:
description: Repositories where the latest backup is older then timeout days will give a warning that there are no recent backups description: Repositories where the latest backup is older then timeout days will give a warning that there are no recent backups
required: required:
- repos - repos
exports:
monitoring:
checks:
- name: main
message: Backup manager activated
interval: 3600
type: string
okvalue: active
- name: lastrun
message: Errors last run
interval: 3600
type: string
okvalue: ""
- name: lastrunrecent
message: Backup manager did not run recently
interval: 3600
type: string
okvalue: "OK"

View file

@ -43,6 +43,23 @@ except BlockingIOError:
print("Another instance is running.") print("Another instance is running.")
sys.exit(1) sys.exit(1)
# These files describe every problem encountered during the last run. They are
# opened in "w" mode, so they are truncated at startup: empty when we ran
# successfully, otherwise they list what needs attention...
# The plain file only ever receives the non-sensitive message text.
logfd = open("{{ homedir }}/.backupmanager.errors", "w")
# The -debug file additionally receives the `sensitive` detail that callers pass
# to output_warning()/output_fatal().
logfd2 = open("{{ homedir }}/.backupmanager.errors-debug", "w")
def output_warning(msg, sensitive=""):
    """Report a non-fatal problem on stdout and in both error files.

    Args:
        msg: Description of the problem without secrets; it is printed and
            written to both the regular and the -debug error file.
        sensitive: Extra detail that may contain secrets; it is printed and
            written to the -debug error file only, never to the regular one.
    """
    print(f"WARN: {msg} {sensitive}")
    logfd.write(f"WARN: {msg}\n")
    logfd2.write(f"WARN: {msg} {sensitive}\n")
    # The run continues after a warning, possibly for a long time. Flush so the
    # entry is on disk immediately instead of sitting in the file buffer until
    # the interpreter exits.
    logfd.flush()
    logfd2.flush()
def output_fatal(msg, sensitive="", dontquityet=False):
    """Report a fatal problem and, unless told otherwise, exit with status 1.

    Args:
        msg: Description of the problem without secrets; it is printed and
            written to both the regular and the -debug error file.
        sensitive: Extra detail that may contain secrets; it is printed and
            written to the -debug error file only, never to the regular one.
        dontquityet: When true, record the error but return to the caller so
            that further problems can still be collected before aborting.

    Raises:
        SystemExit: With exit code 1 when ``dontquityet`` is false.
    """
    print(f"FATAL: {msg} {sensitive}")
    logfd.write(f"FATAL: {msg}\n")
    logfd2.write(f"FATAL: {msg} {sensitive}\n")
    # Flush explicitly: with dontquityet=True the script keeps running, and the
    # entry should be on disk regardless of how the process ends later on.
    logfd.flush()
    logfd2.flush()
    if not dontquityet:
        sys.exit(1)
def run_command(command, env=None): def run_command(command, env=None):
current_env = os.environ.copy() current_env = os.environ.copy()
if env is not None: if env is not None:
@ -53,7 +70,7 @@ def run_command(command, env=None):
sys.stdout.flush() sys.stdout.flush()
sys.stderr.flush() sys.stderr.flush()
if process.returncode != 0: if process.returncode != 0:
raise RuntimeError(f"Command '{command}' failed with return code {process.returncode}, environment is {env}") output_fatal(f"Command '{command}' failed with return code {process.returncode}", sensitive=f"environment is {env}")
def get_command(command, env=None, full_return=False): def get_command(command, env=None, full_return=False):
current_env = os.environ.copy() current_env = os.environ.copy()
@ -71,7 +88,7 @@ def get_command(command, env=None, full_return=False):
if error: if error:
print(error.decode().strip(), file=sys.stderr) print(error.decode().strip(), file=sys.stderr)
if return_code != 0: if return_code != 0:
raise RuntimeError(f"Command '{command}' failed with return code {return_code}, environment is {env}") output_fatal(f"Command '{command}' failed with return code {return_code}", sensitive=f"environment is {env}")
return output.decode() return output.decode()
def get_snapshot_info(repo): def get_snapshot_info(repo):
@ -128,23 +145,22 @@ for r in repos:
repos_ok.append({"repo":r,"config":repoconfig}) repos_ok.append({"repo":r,"config":repoconfig})
if len(repos_ok)==0: if len(repos_ok)==0:
print("None of the repositories can be accessed. At least one must be reachable for me to output repository init commands") output_fatal("None of the repositories can be accessed. At least one must be reachable for me to output repository init commands")
sys.exit(1)
if polynomial_ok==False: if polynomial_ok==False:
print("Not all repositories have the same chunker polynomial configured. This can ONLY be configured when the repository is first created. Please delete the repositories you can rebuild...") msg = "Not all repositories have the same chunker polynomial configured. This can ONLY be configured when the repository is first created. Please delete the repositories you can rebuild...\n"
s = ""
for r in repos_ok: for r in repos_ok:
print(f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}") s = s + f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}\n"
sys.exit(1) output_fatal(msg, sensitive=s)
if len(repos_in_error)!=0: if len(repos_in_error)!=0:
print("Could not open all repositories. Check that they are accessible and that the passwords are correct. If they are not yet initialized, use the following commands:") msg = "Could not open all repositories. Check that they are accessible and that the passwords are correct."
s = "If they are not yet initialized, use the following commands:\n\n"
repo_from=repos_ok[0]['repo'] repo_from=repos_ok[0]['repo']
for r in repos_in_error: for r in repos_in_error:
print() s = s + f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}\n"
print(f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}") output_fatal(msg, sensitive=s)
sys.exit(1)
for r in repos: for r in repos:
print(f"Getting snapshot list for repo {r['name']}") print(f"Getting snapshot list for repo {r['name']}")
@ -160,11 +176,10 @@ for r in repos:
if s['hostname'] not in r['hosts']: if s['hostname'] not in r['hosts']:
wronghosts.update(s['hostname']) wronghosts.update(s['hostname'])
if wronghosts: if wronghosts:
print(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list") output_fatal(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list", dontquityet=True)
hostsok = False hostsok = False
if not hostsok: if not hostsok:
print("Host information not ok, aborting") output_fatal("Host information not ok, aborting")
sys.exit(1)
for host in allhosts: for host in allhosts:
print(f"Syncing hostname {host}") print(f"Syncing hostname {host}")
@ -194,7 +209,7 @@ for host in allhosts:
most_recent_backup_on = r most_recent_backup_on = r
most_recent_backup_id = s['id'] most_recent_backup_id = s['id']
if most_recent_backup_ts == 0: if most_recent_backup_ts == 0:
print(f"WARNING: There are no backups for {host}") output_warning(f"There are no backups for {host}")
continue continue
# We now know the most recent backup. See if a backup is present on all targets that carry this hostname. # We now know the most recent backup. See if a backup is present on all targets that carry this hostname.
@ -228,9 +243,9 @@ for host in allhosts:
timeout = hosts[host]["timeout"] timeout = hosts[host]["timeout"]
if not idle: if not idle:
if not have_a_copy: if not have_a_copy:
print(f"WARNING: We do not have a copy for {host}") output_warning(f"We do not have a copy for {host}")
if most_recent_backup_ts < time.time()-(timeout*24*3600): if most_recent_backup_ts < time.time()-(timeout*24*3600):
print(f"WARNING: Last backup for {host} is too old") output_warning(f"Last backup for {host} is too old")
for r in repos: for r in repos:
e = {} e = {}

16
monitor-test.sh.j2 Normal file
View file

@ -0,0 +1,16 @@
#!/bin/bash
# Send a single command to the monitoring socket and print the reply.
# Usage: ./monitor-test.sh <command>

if [ $# -ne 1 ]; then
    echo "Usage: $0 <command>"
    exit 1
fi

command="$1"

# Connect to the socket and send the command. The path is quoted so that a
# state directory containing whitespace is still passed to nc as one argument.
output=$(nc -U "{{ statedir }}/{{ name }}.monitoring" <<< "$command")

# Print output
echo "$output"

18
monitor.sh.j2 Normal file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Answer one monitoring query: read a check name from stdin and print the
# current value of that check on stdout.

errors_file="{{ homedir }}/.backupmanager.errors"

read -r command

case $command in
    main)
        # Prints the state of the timer unit (e.g. "active").
        systemctl is-active --user backup-manager.timer
        ;;
    lastrun)
        # Contents of the error file written by the last backup manager run.
        cat "$errors_file"
        ;;
    lastrunrecent)
        # find only prints the path when the file was modified less than two
        # days ago; -n on the quoted substitution tests for that output without
        # depending on word splitting of the path.
        if [ -f "$errors_file" ] && [ -n "$(find "$errors_file" -mtime -2)" ]; then
            echo "OK"
        else
            echo "outdated"
        fi
        ;;
    *)
        echo "Parameter unknown"
        ;;
esac

9
monitor.socket.j2 Normal file
View file

@ -0,0 +1,9 @@
# Socket unit through which the monitoring checks are queried.
[Unit]
Description=Execute Command Socket
[Socket]
# Unix stream socket; monitor-test.sh connects to this same path with `nc -U`.
ListenStream={{ statedir }}/{{ name }}.monitoring
# Per systemd.socket(5), Accept=yes spawns one instance of the matching
# template service (monitor@.service) for each incoming connection.
Accept=yes
[Install]
WantedBy=sockets.target

15
monitor@.service.j2 Normal file
View file

@ -0,0 +1,15 @@
# Template service that handles one connection accepted on monitor.socket.
[Unit]
Description=Execute Command Service
# NOTE(review): this unit is installed under ~/.config/systemd/user; confirm
# that network.target is meaningful in the user manager.
After=network.target
Requires=monitor.socket
[Service]
Type=simple
# monitor.sh reads one command from stdin and prints the answer on stdout.
ExecStart={{ homedir }}/monitor.sh
# Connect the script's stdin and stdout to the accepted socket connection.
StandardInput=socket
StandardOutput=socket
TimeoutStopSec=5
# Upper bound on the runtime of a single request.
RuntimeMaxSec=10
[Install]
# NOTE(review): instances are started by monitor.socket; presumably only the
# socket needs to be enabled, so this section may be unnecessary -- confirm.
WantedBy=sockets.target