backup-manager/backupmanager.j2
Peter Leurs c51e72686e Rewrite logging logic
Now log everything in a file, but split it up into multiple files
The -debug variant can contain sensitive data, but the regular file
cannot.  This way we can use the non-debug variant for the monitoring
later on.

Note that the idea is that those files are empty if the last run was
successful, and contain the error when something went wrong.
2025-08-15 23:09:26 +02:00

295 lines
11 KiB
Django/Jinja

#!/usr/bin/python3
#### Config
{# This section runs at template-render time; the generated script contains
   only plain-Python assignments. Emitted lines stay at column 0 because
   they become top-level Python statements. #}
# List of repository dicts: name, url, key, allowed hosts, cleanup
# parameters, and whether periodic "restic check" runs are enabled.
repos = []
{% for r in config.repos %}
_hosts=[]
{% for h in r.hosts %}
_hosts.append("{{ h }}")
{% endfor %}
repos.append({
"name": "{{ r.name }}",
"url": "{{ r.url }}",
"key": "{{ r.key }}",
"hosts": _hosts,
"cleanup_parameters": "{{ r.cleanup_parameters | default(config.default_cleanup_parameters) }}",
"check_repo": {% if r.check_repo %}True{% else %}False{% endif %},
})
{% endfor %}
{# Recursively render a (possibly nested) mapping as a Python dict literal:
   booleans become True/False, nested mappings recurse, everything else is
   serialized via tojson. #}
{% macro to_python(d) %}{
{% for key, value in d.items() %}
"{{ key }}": {% if value is boolean %}{% if value %}True{% else %}False{% endif %}{% elif value is mapping %}{{ to_python(value) }}{% else %}{{ value | tojson }}{% endif %},
{% endfor %}
}
{% endmacro %}
# Per-host settings; the code below reads hosts[h]["idle"] and
# hosts[h]["timeout"] (max backup age, in days).
hosts = {{ to_python(config.hosts) }}
#### End config
import subprocess
import os
import sys
import json
import datetime
import time
import fcntl
# Single-instance guard: take an exclusive, non-blocking lock on a lock
# file. The fd is deliberately kept open (never closed) so the lock is held
# for the whole lifetime of the process and released by the OS on exit.
lockfd = open("{{ homedir }}/.backupmanager.lock", "w")
try:
    fcntl.flock(lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
    print("Another instance is running.")
    sys.exit(1)
# These files contain all problems with the last run. Empty when we ran
# successfully, otherwise what needs attention. The plain file (logfd) never
# receives sensitive data and is safe to feed to monitoring; the -debug
# file (logfd2) may contain secrets such as repository keys.
logfd = open("{{ homedir }}/.backupmanager.errors", "w")
logfd2 = open("{{ homedir }}/.backupmanager.errors-debug", "w")
def output_warning(msg, sensitive=""):
    """Print a warning and record it in both error logs.

    The plain error log receives only *msg*; *sensitive* (which may hold
    secrets such as repository keys) goes to the console and the -debug
    log only.
    """
    public = f"WARN: {msg}"
    detailed = f"WARN: {msg} {sensitive}"
    print(detailed)
    logfd.write(public + "\n")
    logfd2.write(detailed + "\n")
def output_fatal(msg, sensitive="", dontquityet=False):
    """Report a fatal problem to the console and both error logs.

    Exits the process with status 1 unless *dontquityet* is set (used when
    the caller wants to accumulate several fatal messages before aborting).
    *sensitive* is kept out of the plain error log.
    """
    public = f"FATAL: {msg}"
    detailed = f"FATAL: {msg} {sensitive}"
    print(detailed)
    logfd.write(public + "\n")
    logfd2.write(detailed + "\n")
    if dontquityet:
        return
    sys.exit(1)
def run_command(command, env=None):
    """Run *command* through the shell, streaming its output to ours.

    *env* entries are overlaid on the current environment. A nonzero exit
    status is fatal; the (possibly sensitive) environment is only reported
    to the debug log via output_fatal's *sensitive* channel.
    """
    merged_env = {**os.environ, **(env or {})}
    # Flush our own streams so the child's output interleaves correctly.
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    completed = subprocess.run(command, shell=True, env=merged_env)
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    if completed.returncode != 0:
        output_fatal(f"Command '{command}' failed with return code {completed.returncode}", sensitive=f"environment is {env}")
def get_command(command, env=None, full_return=False):
    """Run *command* through the shell and capture its output.

    With full_return=True, returns (stdout_stripped, stderr_stripped,
    returncode) and leaves error handling to the caller. Otherwise stderr
    is echoed to our stderr, a nonzero exit is fatal, and the raw decoded
    stdout is returned.
    """
    merged_env = {**os.environ, **(env or {})}
    # Flush our own streams so any child output appears in order.
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    completed = subprocess.run(
        command,
        shell=True,
        env=merged_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    for stream in (sys.stdout, sys.stderr):
        stream.flush()
    out_text = completed.stdout.decode()
    err_text = completed.stderr.decode()
    if full_return:
        return out_text.strip(), err_text.strip(), completed.returncode
    if err_text:
        print(err_text.strip(), file=sys.stderr)
    if completed.returncode != 0:
        output_fatal(f"Command '{command}' failed with return code {completed.returncode}", sensitive=f"environment is {env}")
    return out_text
def get_snapshot_info(repo):
    """Return the parsed `restic snapshots --json` list for *repo*.

    *repo* must provide "url" and "key"; credentials are passed to restic
    via its standard environment variables.
    """
    restic_env = {
        "RESTIC_REPOSITORY": repo["url"],
        "RESTIC_PASSWORD": repo["key"],
    }
    raw = get_command("restic snapshots --json", env=restic_env)
    return json.loads(raw)
# Path of the JSON file that persists per-repo bookkeeping between runs
# (last check/purge timestamps and the "last check was ok" flags).
STATUS_FILE = "{{ homedir }}/status.json"
def get_status(name):
    """Fetch one value from the status file.

    Returns False when the file is missing, unreadable as JSON, or does
    not contain *name* — callers treat False as "never recorded".
    """
    try:
        with open(STATUS_FILE, 'r') as fh:
            statuses = json.load(fh)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing, empty or corrupted status file: behave as "no record".
        return False
    return statuses.get(name, False)
def set_status(name, value):
    """Store *value* under *name* in the status file.

    Existing entries are preserved; a missing or corrupted file is
    silently replaced by a fresh dictionary.
    """
    try:
        with open(STATUS_FILE, 'r') as fh:
            statuses = json.load(fh)
    except (FileNotFoundError, json.JSONDecodeError):
        # Start from scratch when there is nothing usable on disk.
        statuses = {}
    statuses[name] = value
    with open(STATUS_FILE, 'w') as fh:
        json.dump(statuses, fh, indent=4)
# Check that we can open the repositories properly. Every repo must share
# the same chunker polynomial, otherwise `restic copy` between them would
# re-chunk (and effectively duplicate) all data.
repos_in_error=[]
repos_ok=[]
polynomial=""
polynomial_ok=True
for r in repos:
    print(f"Checking repo {r['name']}")
    # `restic cat config` both verifies credentials and returns the
    # chunker polynomial in one call.
    output,error,rc = get_command("restic cat config --json",{"RESTIC_REPOSITORY": r['url'],"RESTIC_PASSWORD": r['key'] }, full_return=True)
    if rc!=0:
        repos_in_error.append(r)
    else:
        repoconfig=json.loads(output)
        if polynomial=="":
            # First reachable repo defines the reference polynomial.
            polynomial=repoconfig["chunker_polynomial"]
        else:
            if polynomial!=repoconfig["chunker_polynomial"]:
                polynomial_ok=False
        repos_ok.append({"repo":r,"config":repoconfig})
if len(repos_ok)==0:
    output_fatal("None of the repositories can be accessed. At least one must be reachable for me to output repository init commands")
if polynomial_ok==False:
    msg = "Not all repositories have the same chunker polynomial configured. This can ONLY be configured when the repository is first created. Please delete the repositories you can rebuild...\n"
    s = ""
    for r in repos_ok:
        s = s + f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}\n"
    output_fatal(msg, sensitive=s)
if len(repos_in_error)!=0:
    # Suggest init commands that copy the chunker parameters from a known
    # good repo; the commands contain passwords, so they are "sensitive".
    msg = "Could not open all repositories. Check that they are accessible and that the passwords are correct."
    s = "If they are not yet initialized, use the following commands:\n\n"
    repo_from=repos_ok[0]['repo']
    for r in repos_in_error:
        s = s + f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}\n"
    output_fatal(msg, sensitive=s)
# Cache each repo's snapshot list once; the sync loop below reads it often.
for r in repos:
    print(f"Getting snapshot list for repo {r['name']}")
    r['snapshots'] = get_snapshot_info(r)
# Verify that we have the correct backups stored: every snapshot in every
# repository must belong to a host that is declared for that repository.
# Abort (after reporting all repos) when anything unexpected is stored.
allhosts = set()
hostsok = True
for r in repos:
    wronghosts = set()
    allhosts.update(r['hosts'])
    for s in r['snapshots']:
        if s['hostname'] not in r['hosts']:
            # BUGFIX: this used wronghosts.update(s['hostname']), which
            # treats the hostname string as an iterable and adds its
            # individual characters; add() records the hostname itself.
            wronghosts.add(s['hostname'])
    if wronghosts:
        output_fatal(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list", dontquityet=True)
        hostsok = False
if not hostsok:
    output_fatal("Host information not ok, aborting")
def _parse_snapshot_time(time_string):
    """Convert a restic (Go, RFC3339) timestamp string to a unix timestamp.

    restic emits up to nine fractional-second digits and either a 'Z' or a
    '+HH:MM'/'-HH:MM' offset; strptime's %f accepts at most six digits and
    %z wants the offset without the colon. Split the pieces apart,
    truncate the fraction, and reassemble.

    BUGFIX: the previous inline truncation sliced off by one for 'Z'
    timestamps (leaving a stray fraction digit, e.g. ".11234") and, for
    numeric offsets, deleted the timezone entirely and substituted 'Z',
    shifting the parsed time by the offset.
    """
    if time_string.endswith("Z"):
        base, offset = time_string[:-1], "+0000"
    else:
        # Go always writes the numeric offset as ±HH:MM (6 characters).
        base, offset = time_string[:-6], time_string[-6:].replace(":", "")
    if "." in base:
        whole, frac = base.split(".", 1)
        base = whole + "." + frac[:6]
    else:
        base = base + ".0"
    return datetime.datetime.strptime(base + offset, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp()

for host in allhosts:
    print(f"Syncing hostname {host}")
    # Locate the most recent snapshot for this host across all repos.
    most_recent_backup_ts = 0
    most_recent_backup_str = ""
    most_recent_backup_on = {}
    most_recent_backup_id = ""
    for r in repos:
        for s in r['snapshots']:
            if s['hostname']!=host:
                continue
            unix_time = _parse_snapshot_time(s["time"])
            if unix_time>most_recent_backup_ts:
                most_recent_backup_ts = unix_time
                most_recent_backup_str = s["time"]
                most_recent_backup_on = r
                most_recent_backup_id = s['id']
    if most_recent_backup_ts == 0:
        output_warning(f"There are no backups for {host}")
        continue
    # We now know the most recent backup. See if a backup is present on
    # all targets that carry this hostname; copy it where it is missing.
    have_a_copy=False
    for r in repos:
        if host not in r['hosts']:
            continue
        if r['url']==most_recent_backup_on['url']:
            continue
        have_a_copy = True
        # Snapshots are matched by their exact timestamp string, which
        # `restic copy` preserves.
        copy_exists = False
        for s in r['snapshots']:
            if s['time']==most_recent_backup_str:
                copy_exists = True
        if copy_exists:
            continue
        # Copy!
        e = {}
        e["RESTIC_REPOSITORY"]=r["url"]
        e["RESTIC_PASSWORD"]=r["key"]
        e["RESTIC_FROM_REPOSITORY"]=most_recent_backup_on["url"]
        e["RESTIC_FROM_PASSWORD"]=most_recent_backup_on["key"]
        print(f"Copying backup {most_recent_backup_id} from {most_recent_backup_on['name']} to {r['name']}")
        run_command(f"restic copy {most_recent_backup_id}", env=e)
    # Freshness / redundancy warnings; hosts flagged "idle" are exempt.
    idle = False
    timeout = 10
    if host in hosts:
        idle = hosts[host]["idle"]
        timeout = hosts[host]["timeout"]
    if not idle:
        if not have_a_copy:
            output_warning(f"We do not have a copy for {host}")
        if most_recent_backup_ts < time.time()-(timeout*24*3600):
            output_warning(f"Last backup for {host} is too old")
# Periodic repository integrity checking (only for repos with check_repo
# enabled), at most once per 12 hours per repository.
for r in repos:
    e = {}
    e["RESTIC_REPOSITORY"]=r["url"]
    e["RESTIC_PASSWORD"]=r["key"]
    name = "repo_check_" + r["name"]      # status key: when we last checked
    name2 = "repo_check_ok_" + r["name"]  # status key: did the last check pass
    if r["check_repo"]:
        current_time = int(time.time()) # Get the current Unix timestamp
        last_execution_time = get_status(name)
        # Only check once every 12h. If it is ok, we check 5%, so on average it
        # takes 20 days to read the entire repo if we are extremely lucky with
        # the random. But scanning 100% is very expensive, so we don't do it,
        # unless we are not sure it is all ok.
        # So always scan 100% unless we know it is ok already from a previous scan.
        # If a scan fails, we reset the ok flag before scanning, so next time it
        # will be back to 100%.
        if last_execution_time is False or (current_time - last_execution_time) > (12 * 60 * 60):
            repo_ok = get_status(name2)
            # Clear the ok flag first: if the check below dies, the next run
            # sees "not ok" and escalates to a full scan.
            set_status(name2, False)
            if repo_ok:
                print(f"Checking random part of the backups for repo {r['name']}....")
                run_command(f"restic check --read-data-subset=5%", env=e)
            else:
                print(f"Checking full backup of the backups for repo {r['name']}....")
                run_command("restic check", env=e)
            # run_command exits on failure, so reaching here means the check
            # passed; record success and the check time.
            set_status(name2, True)
            set_status(name, current_time)
# Retention maintenance: every 15 days per repo, forget snapshots per the
# configured policy, then prune unreferenced data.
for r in repos:
    e = {}
    e["RESTIC_REPOSITORY"]=r["url"]
    e["RESTIC_PASSWORD"]=r["key"]
    name = "repo_purge_" + r["name"]  # status key: when we last purged
    current_time = int(time.time()) # Get the current Unix timestamp
    last_execution_time = get_status(name)
    if last_execution_time is False or (current_time - last_execution_time) > (15 * 24 * 60 * 60):
        print(f"forgetting old backups for repo {r['name']}....")
        run_command(f"restic forget {r['cleanup_parameters']}", env=e)
        print(f"Pruning old backups for repo {r['name']}....")
        # --max-unused 10%: stop pruning once unused data is below 10%,
        # trading some repo size for much cheaper prune runs.
        run_command("restic prune --max-unused 10%", env=e)
        # Timestamp only advances when the purge actually ran to completion.
        set_status(name, current_time)