# 2024-12-21 22:55:27 +01:00
#!/usr/bin/python3
#### Config
{# This file is a Jinja2 template: the blocks between the config markers
   are expanded at deploy time into plain Python literals. #}
repos = []
{% for r in config.repos %}
{# Build the per-repo host list first, then append the repo record. #}
_hosts=[]
{% for h in r.hosts %}
_hosts.append("{{ h }}")
{% endfor %}
repos.append({
"name": "{{ r.name }}",
"url": "{{ r.url }}",
"key": "{{ r.key }}",
"hosts": _hosts,
"cleanup_parameters": "{{ r.cleanup_parameters | default(config.default_cleanup_parameters) }}",
"check_repo": {% if r.check_repo %}True{% else %}False{% endif %},
})
{% endfor %}
{# Recursively render a config mapping as a Python dict literal:
   booleans become True/False, nested mappings recurse, and any other
   scalar is JSON-encoded (valid Python for strings/numbers/null). #}
{% macro to_python(d) %}{
{% for key, value in d.items() %}
"{{ key }}": {% if value is boolean %}{% if value %}True{% else %}False{% endif %}{% elif value is mapping %}{{ to_python(value) }}{% else %}{{ value | tojson }}{% endif %},
{% endfor %}
}
{% endmacro %}
hosts = {{ to_python(config.hosts) }}
#### End config
import subprocess
import os
import sys
import json
import datetime
import time
import fcntl
# Single-instance guard: take an exclusive, non-blocking flock on a
# lockfile in the home directory. The descriptor is deliberately kept
# open (never closed) so the lock is held for the whole process
# lifetime; the OS releases it automatically on exit.
lockfd = open("{{ homedir }}/.backupmanager.lock", "w")
try:
    fcntl.flock(lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except BlockingIOError:
    # LOCK_NB makes flock raise instead of blocking when another
    # instance already holds the lock.
    print("Another instance is running.")
    sys.exit(1)
def run_command(command, env=None):
    """Run *command* through the shell, streaming output to our stdout/stderr.

    command: shell command line (shell=True — must come from trusted
        config only, never from untrusted input).
    env: optional dict of extra environment variables layered on top of
        a copy of os.environ.

    Raises RuntimeError when the command exits non-zero.
    """
    # Removed stray VCS timestamp lines that had been pasted into the
    # body and broke the syntax.
    current_env = os.environ.copy()
    if env is not None:
        current_env.update(env)
    # Flush our own buffers so the child's output interleaves correctly
    # with anything we already printed.
    sys.stdout.flush()
    sys.stderr.flush()
    process = subprocess.run(command, shell=True, env=current_env)
    sys.stdout.flush()
    sys.stderr.flush()
    if process.returncode != 0:
        raise RuntimeError(f"Command '{command}' failed with return code {process.returncode}, environment is {env}")
def get_command(command, env=None, full_return=False):
    """Run *command* through the shell and capture its output.

    command: shell command line (shell=True — trusted config input only).
    env: optional dict of extra environment variables layered on top of
        a copy of os.environ.
    full_return: when True, return (stdout, stderr, returncode) with
        stdout/stderr decoded and stripped, and never raise on failure.

    When full_return is False: print captured stderr (if any) to our
    stderr, raise RuntimeError on a non-zero exit code, and return the
    decoded stdout (not stripped).
    """
    # Removed stray VCS timestamp lines that had been pasted into the
    # body and broke the syntax.
    current_env = os.environ.copy()
    if env is not None:
        current_env.update(env)
    # Flush our buffers so any child output we relay appears in order.
    sys.stdout.flush()
    sys.stderr.flush()
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=current_env)
    output, error = process.communicate()
    sys.stdout.flush()
    sys.stderr.flush()
    return_code = process.poll()
    if full_return:
        return output.decode().strip(), error.decode().strip(), return_code
    if error:
        print(error.decode().strip(), file=sys.stderr)
    if return_code != 0:
        raise RuntimeError(f"Command '{command}' failed with return code {return_code}, environment is {env}")
    return output.decode()
def get_snapshot_info(repo):
    """Return the parsed list of snapshots for *repo*.

    repo: dict with at least "url" (restic repository URL) and "key"
        (repository password), as built in the config section above.

    Runs `restic snapshots --json` and returns json.loads() of its
    stdout (a list of snapshot dicts). Raises RuntimeError via
    get_command() if restic fails.
    """
    # The command string was a pointless f-string (no placeholders).
    env = {
        "RESTIC_REPOSITORY": repo["url"],
        "RESTIC_PASSWORD": repo["key"],
    }
    data = get_command("restic snapshots --json", env=env)
    return json.loads(data)
# Path of the JSON file that persists per-task state (last-run unix
# timestamps and ok-flags) across invocations; read/written by
# get_status()/set_status().
STATUS_FILE = "{{ homedir }}/status.json"
def get_status(name):
    """Look up *name* in the status file.

    Returns the stored value, or False when the file is missing, not
    valid JSON, or does not contain *name*.
    """
    try:
        with open(STATUS_FILE, 'r') as fh:
            statuses = json.load(fh)
    except FileNotFoundError:
        return False
    except json.JSONDecodeError:
        # Empty or corrupted file behaves the same as "no status yet".
        return False
    return statuses.get(name, False)
def set_status(name, value):
    """Persist *value* under *name* in the status file.

    Existing entries are preserved; an unreadable or corrupted file is
    silently replaced with a fresh one containing only this entry.
    """
    current = {}
    if os.path.exists(STATUS_FILE):
        with open(STATUS_FILE, 'r') as fh:
            try:
                current = json.load(fh)
            except json.JSONDecodeError:
                # Start over with an empty mapping on corruption.
                pass
    current[name] = value
    with open(STATUS_FILE, 'w') as fh:
        json.dump(current, fh, indent=4)
# Check that we can open the repositories properly
repos_in_error=[]
repos_ok=[]
# All repos must share one chunker polynomial so that `restic copy`
# between them deduplicates; remember the first one seen and compare
# every other repo against it.
polynomial=""
polynomial_ok=True
for r in repos:
    print(f"Checking repo {r['name']}")
    # `restic cat config --json` is a cheap reachability + password test
    # that also returns the chunker polynomial.
    output,error,rc = get_command("restic cat config --json",{"RESTIC_REPOSITORY": r['url'],"RESTIC_PASSWORD": r['key'] }, full_return=True)
    if rc!=0:
        repos_in_error.append(r)
    else:
        repoconfig=json.loads(output)
        if polynomial=="":
            polynomial=repoconfig["chunker_polynomial"]
        else:
            if polynomial!=repoconfig["chunker_polynomial"]:
                polynomial_ok=False
        repos_ok.append({"repo":r,"config":repoconfig})
# At least one repo must be reachable: its chunker params seed the init
# commands suggested below for the unreachable ones.
if len(repos_ok)==0:
    print("None of the repositories can be accessed. At least one must be reachable for me to output repository init commands")
    sys.exit(1)
if polynomial_ok==False:
    print("Not all repositories have the same chunker polynomial configured. This can ONLY be configured when the repository is first created. Please delete the repositories you can rebuild...")
    for r in repos_ok:
        print(f"Repo: {r['repo']['url']} polynomial: {r['config']['chunker_polynomial']}")
    sys.exit(1)
if len(repos_in_error)!=0:
    print("Could not open all repositories. Check that they are accessible and that the passwords are correct. If they are not yet initialized, use the following commands:")
    # Suggest init commands that copy chunker params from a working repo,
    # keeping the polynomial consistent across the fleet.
    repo_from=repos_ok[0]['repo']
    for r in repos_in_error:
        print()
        print(f"RESTIC_PASSWORD={r['key']} RESTIC_FROM_PASSWORD={repo_from['key']} restic init --copy-chunker-params=true --from-repo {repo_from['url']} -r {r['url']}")
    sys.exit(1)
# Fetch the snapshot list of every repo, then verify that each repo only
# contains backups for hosts it is configured to carry.
for r in repos:
    print(f"Getting snapshot list for repo {r['name']}")
    r['snapshots'] = get_snapshot_info(r)
# Verify that we have the correct backups stored
allhosts = set()
hostsok = True
for r in repos:
    wronghosts = set()
    allhosts.update(r['hosts'])
    for s in r['snapshots']:
        if s['hostname'] not in r['hosts']:
            # Bug fix: set.update(str) iterates the string and inserted
            # each CHARACTER of the hostname; add() stores the hostname.
            wronghosts.add(s['hostname'])
    if wronghosts:
        print(f"The repository {r['name']} contains backups for unknown hosts {wronghosts}, either delete the backups, or add the hosts to the list")
        hostsok = False
if not hostsok:
    print("Host information not ok, aborting")
    sys.exit(1)
def _restic_time_to_unix(time_string):
    """Return the unix timestamp for a restic RFC3339 snapshot time.

    restic (written in Go) emits up to nine fractional-second digits,
    while Python's strptime "%f" accepts at most six, so surplus digits
    are dropped. "%z" accepts both 'Z' and offsets like '+01:00' on
    Python >= 3.7. The previous in-place truncation sliced with the
    fractional length but forgot the trailing 'Z', leaving a stray digit
    and corrupting the fraction (e.g. '...00.123456789Z' became
    '...00.11234Z'); rebuilding from parts fixes that.
    """
    base, dot, rest = time_string.partition('.')
    if not dot:
        # No fractional seconds at all (e.g. "...T10:00:00Z").
        return datetime.datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S%z").timestamp()
    # Split the fractional digits from the timezone suffix ('Z'/'+01:00').
    i = 0
    while i < len(rest) and rest[i].isdigit():
        i += 1
    frac = (rest[:i] or '0')[:6]
    tz = rest[i:]
    normalized = base + '.' + frac + tz
    return datetime.datetime.strptime(normalized, "%Y-%m-%dT%H:%M:%S.%f%z").timestamp()

for host in allhosts:
    print(f"Syncing hostname {host}")
    # Find the globally most recent snapshot of this host across repos.
    most_recent_backup_ts = 0
    most_recent_backup_str = ""
    most_recent_backup_on = {}
    most_recent_backup_id = ""
    for r in repos:
        for s in r['snapshots']:
            if s['hostname']!=host:
                continue
            unix_time = _restic_time_to_unix(s["time"])
            if unix_time>most_recent_backup_ts:
                most_recent_backup_ts = unix_time
                most_recent_backup_str = s["time"]
                most_recent_backup_on = r
                most_recent_backup_id = s['id']
    if most_recent_backup_ts == 0:
        print(f"WARNING: There are no backups for {host}")
        continue
    # We now know the most recent backup. See if a backup is present on all targets that carry this hostname.
    have_a_copy=False
    for r in repos:
        if host not in r['hosts']:
            continue
        if r['url']==most_recent_backup_on['url']:
            # The repo already holding the newest snapshot is the source.
            continue
        have_a_copy = True
        copy_exists = False
        for s in r['snapshots']:
            # Snapshots keep their original time string when copied, so
            # an exact time match identifies an existing copy.
            if s['time']==most_recent_backup_str:
                copy_exists = True
        if copy_exists:
            continue
        # Copy!
        print(f"Copying backup {most_recent_backup_id} from {most_recent_backup_on['name']} to {r['name']}")
        e = {}
        e["RESTIC_REPOSITORY"]=r["url"]
        e["RESTIC_PASSWORD"]=r["key"]
        e["RESTIC_FROM_REPOSITORY"]=most_recent_backup_on["url"]
        e["RESTIC_FROM_PASSWORD"]=most_recent_backup_on["key"]
        run_command(f"restic copy {most_recent_backup_id}", env=e)
    # Per-host alerting thresholds; defaults for unconfigured hosts.
    idle = False
    timeout = 10
    if host in hosts:
        idle = hosts[host]["idle"]
        timeout = hosts[host]["timeout"]
    if not idle:
        if not have_a_copy:
            print(f"WARNING: We do not have a copy for {host}")
        if most_recent_backup_ts < time.time()-(timeout*24*3600):
            print(f"WARNING: Last backup for {host} is too old")
# Periodic integrity check of each repository that has check_repo set.
for r in repos:
    e = {}
    e["RESTIC_REPOSITORY"]=r["url"]
    e["RESTIC_PASSWORD"]=r["key"]
    name = "repo_check_" + r["name"]       # unix time of the last check
    name2 = "repo_check_ok_" + r["name"]   # whether the last check passed
    if r["check_repo"]:
        current_time = int(time.time())  # Get the current Unix timestamp
        last_execution_time = get_status(name)
        # Only check once every 12h. If the repo is known-good we read a
        # random 5% subset, so on average it takes ~20 days to read the
        # entire repo if we are extremely lucky with the random choice.
        # Scanning 100% is very expensive, so we only do that when we are
        # not sure everything is ok. The ok flag is cleared BEFORE the
        # scan and set again only after success, so a failed or
        # interrupted scan forces the next run back to a full 100% check.
        if last_execution_time is False or (current_time - last_execution_time) > (12 * 60 * 60):
            repo_ok = get_status(name2)
            set_status(name2, False)
            if repo_ok:
                print(f"Checking random part of the backups for repo {r['name']}....")
                # Was a pointless f-string (no placeholders).
                run_command("restic check --read-data-subset=5%", env=e)
            else:
                print(f"Checking full backup of the backups for repo {r['name']}....")
                run_command("restic check", env=e)
            set_status(name2, True)
            set_status(name, current_time)
# Periodic retention maintenance: forget expired snapshots and prune
# unreferenced data, at most once every 15 days per repository.
for repo in repos:
    repo_env = {
        "RESTIC_REPOSITORY": repo["url"],
        "RESTIC_PASSWORD": repo["key"],
    }
    status_key = "repo_purge_" + repo["name"]
    now = int(time.time())  # current Unix timestamp
    last_run = get_status(status_key)
    fifteen_days = 15 * 24 * 60 * 60
    if last_run is False or (now - last_run) > fifteen_days:
        print(f"forgetting old backups for repo {repo['name']}....")
        run_command(f"restic forget {repo['cleanup_parameters']}", env=repo_env)
        print(f"Pruning old backups for repo {repo['name']}....")
        run_command("restic prune --max-unused 10%", env=repo_env)
        set_status(status_key, now)