Now log everything to a file, but split it across multiple files. The -debug variant can contain sensitive data, while the regular file cannot; this way we can use the non-debug variant for monitoring later on. Note that the idea is that those files are empty if the last run was successful, and contain the error when something went wrong.
295 lines
11 KiB
Django/Jinja
295 lines
11 KiB
Django/Jinja
#!/usr/bin/python3

#### Config

# Repository list rendered from the deployment configuration. Every scalar is
# emitted through `tojson` so that names/URLs/keys containing quotes or
# backslashes cannot break the generated Python (a JSON string literal is also
# a valid Python string literal). Previously values were pasted between bare
# double quotes, which produced a syntax error for such values.
repos = []
{% for r in config.repos %}
_hosts = []
{% for h in r.hosts %}
_hosts.append({{ h | tojson }})
{% endfor %}
repos.append({
    "name": {{ r.name | tojson }},
    "url": {{ r.url | tojson }},
    "key": {{ r.key | tojson }},
    "hosts": _hosts,
    "cleanup_parameters": {{ r.cleanup_parameters | default(config.default_cleanup_parameters) | tojson }},
    "check_repo": {% if r.check_repo %}True{% else %}False{% endif %},
})
{% endfor %}
|
|
|
|
{# Recursively render a (possibly nested) config mapping as a Python dict
   literal. Booleans become Python True/False, nested mappings recurse, and
   every other value goes through `tojson` (a JSON literal is also a valid
   Python literal for JSON-safe values). NOTE(review): lists fall into the
   tojson branch as well — fine for JSON-safe content, verify if configs ever
   contain non-JSON values. #}
{% macro to_python(d) %}{
{% for key, value in d.items() %}
"{{ key }}": {% if value is boolean %}{% if value %}True{% else %}False{% endif %}{% elif value is mapping %}{{ to_python(value) }}{% else %}{{ value | tojson }}{% endif %},
{% endfor %}
}
{% endmacro %}
|
|
|
|
# Per-host tuning rendered as a Python dict (the script later reads each
# host's "idle" flag and "timeout" in days from this mapping).
hosts = {{ to_python(config.hosts) }}
#### End config
|
|
|
|
import subprocess
import os
import sys
import json
import datetime
import time

import fcntl

# Single-instance guard: take an exclusive, non-blocking flock on a lock file.
# The fd is deliberately kept open for the whole process lifetime so the lock
# is only released when we exit.
lockfd = open("{{ homedir }}/.backupmanager.lock", "w")
try:
    fcntl.flock(lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
    # Another process already holds the lock — bail out immediately.
    print("Another instance is running.")
    sys.exit(1)
|
|
|
|
# These files contain all problems with the last run. Empty when we ran
# successfully, otherwise whatever needs attention. The -debug variant may
# receive sensitive details (passwords, environments); the plain variant must
# stay free of secrets so it can be exposed to monitoring.
logfd = open("{{ homedir }}/.backupmanager.errors", "w")
logfd2 = open("{{ homedir }}/.backupmanager.errors-debug", "w")
|
|
|
|
def output_warning(msg, sensitive=""):
    """Report a non-fatal problem.

    The message goes to stdout and to both error files; the *sensitive* part
    (passwords, environments, ...) is written only to the -debug file so the
    plain errors file stays safe for monitoring.
    """
    print(f"WARN: {msg} {sensitive}")
    logfd.write(f"WARN: {msg}\n")
    logfd2.write(f"WARN: {msg} {sensitive}\n")
    # Flush immediately: the error files are read by external monitoring and
    # must be complete even if the process later dies without a clean exit.
    logfd.flush()
    logfd2.flush()
|
|
|
|
def output_fatal(msg, sensitive="", dontquityet=False):
    """Report a fatal problem and (normally) terminate with exit status 1.

    Like output_warning, *sensitive* details only go to the -debug file.
    With dontquityet=True the caller can accumulate several fatal messages
    before aborting (it is then expected to call output_fatal again, or exit).
    """
    print(f"FATAL: {msg} {sensitive}")
    logfd.write(f"FATAL: {msg}\n")
    logfd2.write(f"FATAL: {msg} {sensitive}\n")
    # Flush so the message is on disk before we (possibly) exit or crash.
    logfd.flush()
    logfd2.flush()
    if not dontquityet:
        sys.exit(1)
|
|
|
|
def run_command(command, env=None):
    """Run *command* through the shell, inheriting our stdout/stderr.

    Variables from *env* are layered on top of the current process
    environment. A non-zero exit status is fatal: the (sensitive) environment
    is logged to the debug file only, and the script terminates.
    """
    merged_env = dict(os.environ)
    if env:
        merged_env.update(env)
    # Flush our own streams so the child's output interleaves in order.
    sys.stdout.flush()
    sys.stderr.flush()
    result = subprocess.run(command, shell=True, env=merged_env)
    sys.stdout.flush()
    sys.stderr.flush()
    if result.returncode != 0:
        output_fatal(f"Command '{command}' failed with return code {result.returncode}", sensitive=f"environment is {env}")
|
|
|
|
def get_command(command, env=None, full_return=False):
    """Run *command* through the shell and capture its output.

    Returns decoded stdout on success. With full_return=True the caller gets
    the tuple (stdout, stderr, returncode) — both streams stripped — and
    error handling is left entirely to the caller. Otherwise any stderr is
    echoed to our stderr and a non-zero exit status is fatal.
    """
    merged_env = dict(os.environ)
    if env:
        merged_env.update(env)
    sys.stdout.flush()
    sys.stderr.flush()
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=merged_env)
    out, err = proc.communicate()
    sys.stdout.flush()
    sys.stderr.flush()
    rc = proc.poll()
    if full_return:
        return out.decode().strip(), err.decode().strip(), rc
    if err:
        print(err.decode().strip(), file=sys.stderr)
    if rc != 0:
        output_fatal(f"Command '{command}' failed with return code {rc}", sensitive=f"environment is {env}")
    # NOTE: unlike the full_return path, this result is not stripped.
    return out.decode()
|
|
|
|
def get_snapshot_info(repo):
    """Return the parsed `restic snapshots --json` listing for *repo*.

    *repo* is one of the dicts from the rendered `repos` list; its url/key
    are passed to restic via the environment so they never hit the command
    line.
    """
    restic_env = {
        "RESTIC_REPOSITORY": repo["url"],
        "RESTIC_PASSWORD": repo["key"],
    }
    return json.loads(get_command("restic snapshots --json", env=restic_env))
|
|
|
|
|
|
# Path of the JSON file that remembers when each periodic job last ran.
STATUS_FILE = "{{ homedir }}/status.json"

def get_status(name):
    """Return the stored status value for *name*, or False when unknown.

    False is returned when the status file is missing, is not valid JSON,
    or simply has no entry for *name*. Uses EAFP instead of an exists()
    pre-check to avoid the check-then-open race.
    """
    try:
        with open(STATUS_FILE, 'r') as file:
            statuses = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        # Missing or corrupt file: behave as if nothing has ever run.
        return False
    return statuses.get(name, False)
|
|
|
|
def set_status(name, value):
    """Persist *value* under *name* in the status file.

    The whole file is read, updated, and rewritten. A missing or corrupt
    file just means we start from an empty mapping. Uses EAFP instead of an
    exists() pre-check to avoid the check-then-open race.
    """
    statuses = {}
    try:
        with open(STATUS_FILE, 'r') as file:
            statuses = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        pass  # Ignore errors and start with an empty dictionary
    statuses[name] = value
    with open(STATUS_FILE, 'w') as file:
        json.dump(statuses, file, indent=4)
|
|
|
|
|
|
# Check that we can open the repositories properly
repos_in_error=[]
repos_ok=[]
polynomial=""        # chunker polynomial of the first reachable repo
polynomial_ok=True   # stays True while every reachable repo agrees on it
for r in repos:
    print(f"Checking repo {r['name']}")
    # `restic cat config` is a cheap way to verify that the URL is reachable
    # and the password is correct; full_return=True so failure is not fatal.
    output,error,rc = get_command("restic cat config --json",{"RESTIC_REPOSITORY": r['url'],"RESTIC_PASSWORD": r['key'] }, full_return=True)
    if rc!=0:
        repos_in_error.append(r)
    else:
        repoconfig=json.loads(output)
        # All repos are expected to share one chunker polynomial
        # (presumably so that copies between them deduplicate — see the
        # --copy-chunker-params init command below).
        if polynomial=="":
            polynomial=repoconfig["chunker_polynomial"]
        else:
            if polynomial!=repoconfig["chunker_polynomial"]:
                polynomial_ok=False
        repos_ok.append({"repo":r,"config":repoconfig})

if len(repos_ok)==0:
    output_fatal("None of the repositories can be accessed. At least one must be reachable for me to output repository init commands")

if polynomial_ok==False:
    msg = "Not all repositories have the same chunker polynomial configured. This can ONLY be configured when the repository is first created. Please delete the repositories you can rebuild...\n"
    # The per-repo polynomial listing exposes repo URLs, so it only goes to
    # the debug log via the `sensitive` argument.
    s = ""
    for r in repos_ok:
        s = s + f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}\n"
    output_fatal(msg, sensitive=s)

if len(repos_in_error)!=0:
    msg = "Could not open all repositories. Check that they are accessible and that the passwords are correct."
    # The suggested init commands embed repository passwords, so they are
    # only written to the -debug file.
    s = "If they are not yet initialized, use the following commands:\n\n"
    repo_from=repos_ok[0]['repo']
    for r in repos_in_error:
        s = s + f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}\n"
    output_fatal(msg, sensitive=s)
|
|
|
|
# Fetch every repo's snapshot list once up front; all the logic below works
# on these cached lists.
for r in repos:
    print(f"Getting snapshot list for repo {r['name']}")
    r['snapshots'] = get_snapshot_info(r)
|
|
|
|
# Verify that we have the correct backups stored: every snapshot's hostname
# must appear in its repository's configured host list. While at it, collect
# the union of all configured hosts for the sync pass below.
allhosts = set()
hostsok = True
for r in repos:
    wronghosts = set()
    allhosts.update(r['hosts'])
    for s in r['snapshots']:
        if s['hostname'] not in r['hosts']:
            # BUGFIX: this used set.update(hostname), which iterates the
            # string and adds its individual characters; add() records the
            # hostname itself.
            wronghosts.add(s['hostname'])
    if wronghosts:
        # dontquityet: report every offending repo before aborting.
        output_fatal(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list", dontquityet=True)
        hostsok = False
if not hostsok:
    output_fatal("Host information not ok, aborting")
|
|
|
|
# For every known host: locate its newest snapshot across all repositories,
# make sure that snapshot is present on every other repository that carries
# the host (copying it over where missing), then warn when backups are
# missing, un-replicated, or stale — unless the host is marked idle.
for host in allhosts:
    print(f"Syncing hostname {host}")
    most_recent_backup_ts = 0    # unix timestamp of the newest snapshot seen
    most_recent_backup_str = ""  # its original restic time string (used as identity)
    most_recent_backup_on = {}   # the repo dict it lives on
    most_recent_backup_id = ""   # its snapshot id
    for r in repos:
        for s in r['snapshots']:
            if s['hostname']!=host:
                continue
            time_string = s["time"]
            # Python does not accept a : in the timezone, yet go's code places it. Fix this...
            # time_string = time_string[:-6] + time_string[-6:].replace(":","")
            # Python only accepts 6 digits in the fractals for the seconds....
            # NOTE(review): the normalisation below assumes the time string
            # contains a '.'; everything after the last dot (fraction plus any
            # timezone suffix) is truncated to at most 4 chars and replaced by
            # a literal 'Z'. Verify against snapshots whose timestamp has no
            # fractional seconds.
            frac_seconds_str = time_string.split('.')[-1].rstrip('Z')
            frac_seconds_len = len(frac_seconds_str)
            # Truncate or pad the fractional seconds string as needed
            if frac_seconds_len >= 4:
                frac_seconds_str = frac_seconds_str[:4]
            time_string = time_string[:-frac_seconds_len] + frac_seconds_str + 'Z'
            unix_time = datetime.datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp()

            if unix_time>most_recent_backup_ts:
                most_recent_backup_ts = unix_time
                most_recent_backup_str = s["time"]
                most_recent_backup_on = r
                most_recent_backup_id = s['id']
    if most_recent_backup_ts == 0:
        output_warning(f"There are no backups for {host}")
        continue

    # We now know the most recent backup. See if a backup is present on all targets that carry this hostname.
    have_a_copy=False
    for r in repos:
        if host not in r['hosts']:
            continue
        if r['url']==most_recent_backup_on['url']:
            continue  # the source repository itself is skipped
        # At least one other repo is supposed to hold this host's backup.
        have_a_copy = True
        copy_exists = False
        for s in r['snapshots']:
            # Snapshots are matched by their exact original time string.
            if s['time']==most_recent_backup_str:
                copy_exists = True
        if copy_exists:
            continue

        # Copy!
        print(f"Copying backup {most_recent_backup_id} from {most_recent_backup_on['name']} to {r['name']}")
        # Credentials are passed via the environment, never on the command line.
        e = {}
        e["RESTIC_REPOSITORY"]=r["url"]
        e["RESTIC_PASSWORD"]=r["key"]
        e["RESTIC_FROM_REPOSITORY"]=most_recent_backup_on["url"]
        e["RESTIC_FROM_PASSWORD"]=most_recent_backup_on["key"]
        run_command(f"restic copy {most_recent_backup_id}", env=e)

    # Freshness / replication warnings, skipped for hosts marked idle.
    idle = False
    timeout = 10  # days; default when the host has no entry in `hosts`
    if host in hosts:
        idle = hosts[host]["idle"]
        timeout = hosts[host]["timeout"]
    if not idle:
        if not have_a_copy:
            output_warning(f"We do not have a copy for {host}")
        if most_recent_backup_ts < time.time()-(timeout*24*3600):
            output_warning(f"Last backup for {host} is too old")
|
|
|
|
# Periodic integrity checking, only for repositories with check_repo set.
for r in repos:
    e = {}
    e["RESTIC_REPOSITORY"]=r["url"]
    e["RESTIC_PASSWORD"]=r["key"]

    name = "repo_check_" + r["name"]      # status key: time of last check
    name2 = "repo_check_ok_" + r["name"]  # status key: last check succeeded

    if r["check_repo"]:
        current_time = int(time.time())  # Get the current Unix timestamp
        last_execution_time = get_status(name)

        # only check once every 12h. If it is ok, we check 5%, so on average it takes 20 days
        # to read the entire repo if we are extremely lucky with the random. But scanning 100%
        # is very expensive, so we don't do it, unless we are not sure it is all ok.
        # So always scan 100% unless we know it is ok already from a previous scan.
        # if a scan fails, we reset the ok flag before scanning, so next time it will be back 100%
        if last_execution_time is False or (current_time - last_execution_time) > (12 * 60 * 60):
            repo_ok = get_status(name2)
            # Clear the ok flag *before* checking: if the check dies (fatal
            # exit), the next run falls back to a full 100% scan.
            set_status(name2, False)
            if repo_ok:
                print(f"Checking random part of the backups for repo {r['name']}....")
                run_command(f"restic check --read-data-subset=5%", env=e)
            else:
                print(f"Checking full backup of the backups for repo {r['name']}....")
                run_command("restic check", env=e)
            set_status(name2, True)
            set_status(name, current_time)
|
|
|
|
# Cleanup: forget old snapshots and prune unreferenced data, at most once
# every 15 days per repository.
for r in repos:
    e = {}
    e["RESTIC_REPOSITORY"]=r["url"]
    e["RESTIC_PASSWORD"]=r["key"]
    name = "repo_purge_" + r["name"]  # status key: time of last purge

    current_time = int(time.time())  # Get the current Unix timestamp
    last_execution_time = get_status(name)

    if last_execution_time is False or (current_time - last_execution_time) > (15 * 24 * 60 * 60):
        print(f"forgetting old backups for repo {r['name']}....")
        # cleanup_parameters comes from the rendered config (e.g. retention flags).
        run_command(f"restic forget {r['cleanup_parameters']}", env=e)

        print(f"Pruning old backups for repo {r['name']}....")
        run_command("restic prune --max-unused 10%", env=e)

        # Only recorded after both commands succeeded (run_command is fatal
        # on failure), so a failed purge is retried on the next run.
        set_status(name, current_time)
|