From f83468d45bdf64f7aa0b28821ec3ef1bda0736f6 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:17:58 +0200 Subject: [PATCH 01/12] Add an initial implementation of an internal cfbs command to generate MPF release information Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/commands.py | 6 + cfbs/main.py | 3 + cfbs/utils.py | 20 +++ masterfiles/__init__.py | 0 masterfiles/analyze.py | 54 +++++++ masterfiles/check_download_matches_git.py | 20 +++ masterfiles/check_tarball_checksums.py | 34 +++++ masterfiles/download_all_versions.py | 159 ++++++++++++++++++++ masterfiles/generate_release_information.py | 33 ++++ masterfiles/generate_vcf_download.py | 21 +++ masterfiles/generate_vcf_git_checkout.py | 84 +++++++++++ 11 files changed, 434 insertions(+) create mode 100644 masterfiles/__init__.py create mode 100644 masterfiles/analyze.py create mode 100644 masterfiles/check_download_matches_git.py create mode 100644 masterfiles/check_tarball_checksums.py create mode 100644 masterfiles/download_all_versions.py create mode 100644 masterfiles/generate_release_information.py create mode 100644 masterfiles/generate_vcf_download.py create mode 100644 masterfiles/generate_vcf_git_checkout.py diff --git a/cfbs/commands.py b/cfbs/commands.py index 11d8d461..b709a706 100644 --- a/cfbs/commands.py +++ b/cfbs/commands.py @@ -65,6 +65,7 @@ from cfbs.git_magic import Result, commit_after_command, git_commit_maybe_prompt from cfbs.prompts import YES_NO_CHOICES, prompt_user from cfbs.module import Module, is_module_added_manually +from masterfiles.generate_release_information import generate_release_information class InputDataUpdateFailed(Exception): @@ -1204,3 +1205,8 @@ def get_input_command(name, outfile): log.error("Failed to write json: %s" % e) return 1 return 0 + + +@cfbs_command("generate-release-information") +def generate_release_information_command(): + generate_release_information() 
diff --git a/cfbs/main.py b/cfbs/main.py index ca5d1e3d..3d36bdde 100644 --- a/cfbs/main.py +++ b/cfbs/main.py @@ -91,6 +91,9 @@ def main() -> int: if args.command in ("info", "show"): return commands.info_command(args.args) + if args.command == "generate-release-information": + return commands.generate_release_information_command() + if not is_cfbs_repo(): user_error("This is not a cfbs repo, to get started, type: cfbs init") diff --git a/cfbs/utils.py b/cfbs/utils.py index 74b329c0..83b3e97b 100644 --- a/cfbs/utils.py +++ b/cfbs/utils.py @@ -244,6 +244,14 @@ def is_cfbs_repo() -> bool: return os.path.isfile(cfbs_filename()) +def immediate_subdirectories(path): + return [f.name for f in os.scandir(path) if f.is_dir()] + + +def immediate_files(path): + return [f.name for f in os.scandir(path) if not f.is_dir()] + + def path_append(dir, subdir): dir = os.path.abspath(os.path.expanduser(dir)) return dir if not subdir else os.path.join(dir, subdir) @@ -278,6 +286,18 @@ def cfbs_dir(append=None) -> str: return os.path.join(directory, append) +def string_sha256(input): + return hashlib.sha256(input.encode("utf-8")).hexdigest() + + +def file_sha256(file): + h = hashlib.sha256() + with open(file, "rb") as f: + h.update(f.read()) + + return h.hexdigest() + + class FetchError(Exception): pass diff --git a/masterfiles/__init__.py b/masterfiles/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/masterfiles/analyze.py b/masterfiles/analyze.py new file mode 100644 index 00000000..2f09057f --- /dev/null +++ b/masterfiles/analyze.py @@ -0,0 +1,54 @@ +# TODO merge this with ENT-12099 branch cfbs analyze.py +import os + +from cfbs.utils import file_sha256 + +IGNORED_PATH_COMPONENTS = [".git/", ".gitignore", ".gitattributes"] +# ignore a path iff it contains a component (single file or directory) from this list +# an element of this list should be just one component +# folders should end with '/', files should not +# TODO + + +def initialize_vcf(): + versions_dict = 
{"versions": {}} + checksums_dict = {"checksums": {}} + files_dict = {"files": {}} + + return versions_dict, checksums_dict, files_dict + + +def versions_checksums_files( + files_dir_path, version, versions_dict, checksums_dict, files_dict +): + for root, dirs, files in os.walk(files_dir_path): + for name in files: + full_relpath = os.path.join(root, name) + tarball_relpath = os.path.relpath(full_relpath, files_dir_path) + file_checksum = file_sha256(full_relpath) + + if version not in versions_dict["versions"]: + versions_dict["versions"][version] = {} + if "files" not in versions_dict["versions"][version]: + versions_dict["versions"][version]["files"] = {} + versions_dict["versions"][version]["files"][tarball_relpath] = file_checksum + + if not file_checksum in checksums_dict["checksums"]: + checksums_dict["checksums"][file_checksum] = [] + checksums_dict["checksums"][file_checksum].append( + { + "file": tarball_relpath, + "version": version, + } + ) + + if not tarball_relpath in files_dict["files"]: + files_dict["files"][tarball_relpath] = [] + files_dict["files"][tarball_relpath].append( + { + "checksum": file_checksum, + "version": version, + } + ) + + return versions_dict, checksums_dict, files_dict diff --git a/masterfiles/check_download_matches_git.py b/masterfiles/check_download_matches_git.py new file mode 100644 index 00000000..924f040f --- /dev/null +++ b/masterfiles/check_download_matches_git.py @@ -0,0 +1,20 @@ +# check that the downloadable files match the git files, mitigating a build system supply-chain attack +import os +import dictdiffer + +from cfbs.utils import read_json + + +def check_download_matches_git(versions): + download_versions_dict = read_json("versions.json") + git_versions_dict = read_json("versions-git.json") + + os.makedirs("differences", exist_ok=True) + + for version in versions: + download_version_dict = download_versions_dict["versions"][version]["files"] + git_version_dict = git_versions_dict["versions"][version]["files"] + + 
with open("differences/difference-" + version + ".txt", "w") as f: + for diff in dictdiffer.diff(download_version_dict, git_version_dict): + print(diff, file=f) diff --git a/masterfiles/check_tarball_checksums.py b/masterfiles/check_tarball_checksums.py new file mode 100644 index 00000000..d7e615f6 --- /dev/null +++ b/masterfiles/check_tarball_checksums.py @@ -0,0 +1,34 @@ +from cfbs.utils import file_sha256, immediate_files + + +def check_tarball_checksums(dir_path, downloaded_versions, reported_checksums): + does_match = True + + for version in downloaded_versions: + print(version) + + version_path = dir_path / version + + versions_files = immediate_files(version_path) + # the tarball should be the only file in the version's directory + tarball_name = versions_files[0] + + tarball_path = version_path / tarball_name + + tarball_checksum = file_sha256(tarball_path) + + if version in ("3.10.0", "3.9.2"): + # 3.10.0 lists a .tar.gz, not a .pkg.tar.gz + # 3.9.2 lists no masterfiles + continue + + reported_checksum = reported_checksums[version] + + if tarball_checksum != reported_checksum: + does_match = False + print("* checksum difference:") + print(version) + print(tarball_checksum) + print(reported_checksum) + + return does_match diff --git a/masterfiles/download_all_versions.py b/masterfiles/download_all_versions.py new file mode 100644 index 00000000..c1413e8d --- /dev/null +++ b/masterfiles/download_all_versions.py @@ -0,0 +1,159 @@ +from pathlib import Path +from requests_cache import CachedSession +from shutil import unpack_archive +from urllib.request import urlretrieve + +DOWNLOAD = True +DEBUG = False + +ENTERPRISE_URL = "https://cfengine.com/release-data/enterprise/releases.json" +COMMUNITY_URL = "https://cfengine.com/release-data/community/releases.json" + + +def print_debug(*args, **kwargs): + if DEBUG: + print(*args, **kwargs) + + +def check_url_downloadable(session, url): + headers = session.head(url).headers + downloadable = "attachment" in 
headers.get("Content-Disposition", "") + + content_type = headers.get("content-type") + if "xml" in content_type.lower(): + downloadable = False + elif "gzip" in content_type.lower(): + downloadable = True + + return downloadable + + +def check_analogous_urls(session, version): + url_tarballs = ( + "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-" + + version + + ".pkg.tar.gz" + ) + + url_downloadable = check_url_downloadable(session, url_tarballs) + print_debug("Checking tarballs URL: ", url_downloadable) + print_debug(url_tarballs) + if url_downloadable: + return url_tarballs + + url_enterprise = ( + "https://cfengine-package-repos.s3.amazonaws.com/enterprise/Enterprise-" + + version + + "/misc/cfengine-masterfiles-" + + version + ) + + url_enterprise_0 = url_enterprise + ".pkg.tar.gz" + url_enterprise_1 = url_enterprise + "-1.pkg.tar.gz" + url_enterprise_2 = url_enterprise + "-2.pkg.tar.gz" + url_enterprise_3 = url_enterprise + "-3.pkg.tar.gz" + + print_debug( + "Checking enterprise-0 URL: ", check_url_downloadable(session, url_enterprise_0) + ) + print_debug( + "Checking enterprise-1 URL: ", check_url_downloadable(session, url_enterprise_1) + ) + print_debug( + "Checking enterprise-2 URL: ", check_url_downloadable(session, url_enterprise_2) + ) + print_debug( + "Checking enterprise-3 URL: ", check_url_downloadable(session, url_enterprise_3) + ) + + return None + + +# TODO +# def download_all_versions_community(): +# response = session.get(COMMUNITY_URL) +# # "masterfiles is at a different index" in 3.10.1 happens only for Enterprise, not Community + + +def download_all_versions_enterprise(): + session = CachedSession() + response = session.get(ENTERPRISE_URL) + data = response.json() + + urls_dict = {} + reported_checksums = {} + + for dd in data["releases"]: + version = dd["version"] + print_debug(version) + release_url = dd["URL"] + print_debug(release_url) + + subresponse = session.get(release_url) + subdata = 
subresponse.json() + + subdd = subdata["artifacts"] + if "Additional Assets" not in subdd: + print_debug("Warning: no Additional Assets!") + # happens for 3.9.0b1, 3.8.0b1, 3.6.1, 3.6.0 + if DEBUG: + check_analogous_urls(session, version) + + download_url = None + + else: + # for 3.10.0, for some reason, the masterfiles download link points to the .tar.gz tarball, rather than the .pkg.tar.gz tarball + # here, download the .pkg.tar.gz from a hidden analogous URL instead + if version == "3.10.0": + download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.10.0.pkg.tar.gz" + else: + # there's precisely one version (3.10.1) for which masterfiles is at a different index + if version == "3.10.1": + subdd = subdd["Additional Assets"][1] + else: + subdd = subdd["Additional Assets"][0] + + if subdd["Title"] != "Masterfiles ready-to-install tarball": + print_debug("Warning: not masterfiles!") + # happens for 3.10.1, 3.9.2, 3.9.0, 3.8.2, 3.8.1, 3.8.0, 3.6.2--3.7.4 + if DEBUG: + check_analogous_urls(session, version) + # 3.10.1: see above + # 3.9.2: no masterfiles listed, but an analogous hidden URL exists + # 3.9.0 and others: no masterfiles listed, and an analogous hidden URLs seemingly do not exist + if version == "3.9.2": + download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.9.2.pkg.tar.gz" + else: + download_url = None + else: + download_url = subdd["URL"] + reported_checksums[version] = subdd["SHA256"] + + print_debug(download_url) + if download_url is not None: + urls_dict[version] = download_url + + downloaded_versions = [] + if DOWNLOAD: + root_path = Path("./enterprise") + Path.mkdir(root_path, exist_ok=True) + + for version, url in urls_dict.items(): + # ignore master and .x versions + if url.startswith("http://buildcache"): + continue + + downloaded_versions.append(version) + print(url) + + version_path = root_path / version + Path.mkdir(version_path, exist_ok=True) + + filename = 
url.split("/")[-1] + tarball_path = version_path / filename + urlretrieve(url, tarball_path) + + unpack_archive(tarball_path, version_path / "tarball") + + # for local verification of the reported (Enterprise) (.pkg.tar.gz) checksums + return downloaded_versions, reported_checksums diff --git a/masterfiles/generate_release_information.py b/masterfiles/generate_release_information.py new file mode 100644 index 00000000..9df5f7e7 --- /dev/null +++ b/masterfiles/generate_release_information.py @@ -0,0 +1,33 @@ +# TODO document `cfbs generate-release-information` +# this command uses several extra deps compared to the rest of cfbs +import sys +from pathlib import Path + +from masterfiles.download_all_versions import download_all_versions_enterprise +from masterfiles.check_tarball_checksums import check_tarball_checksums +from masterfiles.generate_vcf_download import generate_vcf_download +from masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout +from masterfiles.check_download_matches_git import check_download_matches_git + +ENTERPRISE_PATH = Path("./enterprise") + + +def generate_release_information(): + # only needs to be done once (although changes could happen afterwards), and silly to do if already have access to hosted files + downloaded_versions, reported_checksums = download_all_versions_enterprise() + # downloaded_versions, reported_checksums = download_all_versions_community() + + # Enterprise 3.9.2 is downloaded but there is no reported checksum, so both args are necessary + if check_tarball_checksums( + ENTERPRISE_PATH, downloaded_versions, reported_checksums + ): + print("Every checksum matches") + else: + print("Checksums differ!") + sys.exit(1) + + generate_vcf_download(ENTERPRISE_PATH, downloaded_versions) + generate_vcf_git_checkout(downloaded_versions) + + check_download_matches_git(downloaded_versions) + # TODO automatic analysis of the difference-*.txts diff --git a/masterfiles/generate_vcf_download.py 
b/masterfiles/generate_vcf_download.py new file mode 100644 index 00000000..1dd3aa0f --- /dev/null +++ b/masterfiles/generate_vcf_download.py @@ -0,0 +1,21 @@ +from cfbs.utils import write_json +from masterfiles.analyze import initialize_vcf, versions_checksums_files + + +def generate_vcf_download(dir_path, downloaded_versions): + """`dir_path`: the path of the directory containing masterfiles versions subdirectories in the form `dir_path/x.y.z/tarball/` + + The `tarball` folder should contain the `masterfiles` folder (older tarballs also have a `modules` folder alongside the `masterfiles` folder). + """ + versions_dict, checksums_dict, files_dict = initialize_vcf() + + for version in downloaded_versions: + files_dir_path = dir_path / version / "tarball" + + versions_dict, checksums_dict, files_dict = versions_checksums_files( + files_dir_path, version, versions_dict, checksums_dict, files_dict + ) + + write_json("versions.json", versions_dict) + write_json("checksums.json", checksums_dict) + write_json("files.json", files_dict) diff --git a/masterfiles/generate_vcf_git_checkout.py b/masterfiles/generate_vcf_git_checkout.py new file mode 100644 index 00000000..25f398c8 --- /dev/null +++ b/masterfiles/generate_vcf_git_checkout.py @@ -0,0 +1,84 @@ +import os +import shutil +import subprocess +import sys + +from cfbs.git import git_exists +from cfbs.utils import write_json +from masterfiles.analyze import initialize_vcf, versions_checksums_files + +DIR_PATH = "." 
+"""The path of the working directory.""" + +MPF_URL = "https://github.com/cfengine/masterfiles" +MPF_PATH = os.path.join(DIR_PATH, "masterfiles") + + +def generate_vcf_git_checkout(interesting_tags=None): + if not git_exists(): + print("`git` was not found") + sys.exit(1) + + # clone the MPF repo every time the script is run, in case there are updates + if os.path.isdir(MPF_PATH): + shutil.rmtree(MPF_PATH) + + subprocess.run( + ["git", "clone", MPF_URL], + cwd=DIR_PATH, + check=True, + ) + + result = subprocess.run( + ["git", "tag"], cwd=MPF_PATH, capture_output=True, check=True + ) + tags = result.stdout.decode("UTF-8").splitlines() + + # if not given, choose tags to checkout - by default, only consider version releases + if interesting_tags is None: + interesting_tags = [] + + for tag in tags: + if "-" not in tag: + interesting_tags.append(tag) + + versions_dict, checksums_dict, files_dict = initialize_vcf() + + for tag in interesting_tags: + print(tag) + + # checkout the version tag + subprocess.run( + ["git", "checkout", "--force", tag], + cwd=MPF_PATH, + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + # a clean is necessary to remove all the undesired files + subprocess.run( + ["git", "clean", "-dffx"], + cwd=MPF_PATH, + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + + # build masterfiles from git as they are in the tarball packages + subprocess.run( + ["./autogen.sh"], + cwd=MPF_PATH, + check=True, + env=dict(os.environ.copy(), EXPLICIT_VERSION=tag), + ) + # older masterfiles version READMEs instruct to use `make install` and newer `make` - always use `make` instead + subprocess.run(["make"], cwd=MPF_PATH, check=True) + + # compute VCF data for all the files + versions_dict, checksums_dict, files_dict = versions_checksums_files( + MPF_PATH, tag, versions_dict, checksums_dict, files_dict + ) + + write_json("versions-git.json", versions_dict) + write_json("checksums-git.json", checksums_dict) + 
write_json("files-git.json", files_dict) From 676229916e6520bd1c8394badc97715bbe3109a8 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:31:01 +0200 Subject: [PATCH 02/12] Remove extra dependencies; clean up code Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- masterfiles/download_all_versions.py | 113 ++++---------------- masterfiles/generate_release_information.py | 13 ++- 2 files changed, 28 insertions(+), 98 deletions(-) diff --git a/masterfiles/download_all_versions.py b/masterfiles/download_all_versions.py index c1413e8d..58b9f0cc 100644 --- a/masterfiles/download_all_versions.py +++ b/masterfiles/download_all_versions.py @@ -1,104 +1,35 @@ from pathlib import Path -from requests_cache import CachedSession -from shutil import unpack_archive -from urllib.request import urlretrieve +import shutil +import urllib.request + +from cfbs.utils import get_json DOWNLOAD = True -DEBUG = False ENTERPRISE_URL = "https://cfengine.com/release-data/enterprise/releases.json" COMMUNITY_URL = "https://cfengine.com/release-data/community/releases.json" - -def print_debug(*args, **kwargs): - if DEBUG: - print(*args, **kwargs) - - -def check_url_downloadable(session, url): - headers = session.head(url).headers - downloadable = "attachment" in headers.get("Content-Disposition", "") - - content_type = headers.get("content-type") - if "xml" in content_type.lower(): - downloadable = False - elif "gzip" in content_type.lower(): - downloadable = True - - return downloadable - - -def check_analogous_urls(session, version): - url_tarballs = ( - "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-" - + version - + ".pkg.tar.gz" - ) - - url_downloadable = check_url_downloadable(session, url_tarballs) - print_debug("Checking tarballs URL: ", url_downloadable) - print_debug(url_tarballs) - if url_downloadable: - return url_tarballs - - url_enterprise = ( - 
"https://cfengine-package-repos.s3.amazonaws.com/enterprise/Enterprise-" - + version - + "/misc/cfengine-masterfiles-" - + version - ) - - url_enterprise_0 = url_enterprise + ".pkg.tar.gz" - url_enterprise_1 = url_enterprise + "-1.pkg.tar.gz" - url_enterprise_2 = url_enterprise + "-2.pkg.tar.gz" - url_enterprise_3 = url_enterprise + "-3.pkg.tar.gz" - - print_debug( - "Checking enterprise-0 URL: ", check_url_downloadable(session, url_enterprise_0) - ) - print_debug( - "Checking enterprise-1 URL: ", check_url_downloadable(session, url_enterprise_1) - ) - print_debug( - "Checking enterprise-2 URL: ", check_url_downloadable(session, url_enterprise_2) - ) - print_debug( - "Checking enterprise-3 URL: ", check_url_downloadable(session, url_enterprise_3) - ) - - return None - - # TODO # def download_all_versions_community(): -# response = session.get(COMMUNITY_URL) +# data = get_json(COMMUNITY_URL) # # "masterfiles is at a different index" in 3.10.1 happens only for Enterprise, not Community def download_all_versions_enterprise(): - session = CachedSession() - response = session.get(ENTERPRISE_URL) - data = response.json() + data = get_json(ENTERPRISE_URL) urls_dict = {} reported_checksums = {} - for dd in data["releases"]: - version = dd["version"] - print_debug(version) - release_url = dd["URL"] - print_debug(release_url) + for releases_data in data["releases"]: + version = releases_data["version"] + release_url = releases_data["URL"] - subresponse = session.get(release_url) - subdata = subresponse.json() + subdata = get_json(release_url) - subdd = subdata["artifacts"] - if "Additional Assets" not in subdd: - print_debug("Warning: no Additional Assets!") + artifacts_data = subdata["artifacts"] + if "Additional Assets" not in artifacts_data: # happens for 3.9.0b1, 3.8.0b1, 3.6.1, 3.6.0 - if DEBUG: - check_analogous_urls(session, version) - download_url = None else: @@ -109,27 +40,23 @@ def download_all_versions_enterprise(): else: # there's precisely one version (3.10.1) 
for which masterfiles is at a different index if version == "3.10.1": - subdd = subdd["Additional Assets"][1] + artifacts_data = artifacts_data["Additional Assets"][1] else: - subdd = subdd["Additional Assets"][0] + artifacts_data = artifacts_data["Additional Assets"][0] - if subdd["Title"] != "Masterfiles ready-to-install tarball": - print_debug("Warning: not masterfiles!") + if artifacts_data["Title"] != "Masterfiles ready-to-install tarball": # happens for 3.10.1, 3.9.2, 3.9.0, 3.8.2, 3.8.1, 3.8.0, 3.6.2--3.7.4 - if DEBUG: - check_analogous_urls(session, version) # 3.10.1: see above # 3.9.2: no masterfiles listed, but an analogous hidden URL exists - # 3.9.0 and others: no masterfiles listed, and an analogous hidden URLs seemingly do not exist + # 3.9.0 and others: no masterfiles listed, and analogous hidden URLs seemingly do not exist if version == "3.9.2": download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.9.2.pkg.tar.gz" else: download_url = None else: - download_url = subdd["URL"] - reported_checksums[version] = subdd["SHA256"] + download_url = artifacts_data["URL"] + reported_checksums[version] = artifacts_data["SHA256"] - print_debug(download_url) if download_url is not None: urls_dict[version] = download_url @@ -151,9 +78,9 @@ def download_all_versions_enterprise(): filename = url.split("/")[-1] tarball_path = version_path / filename - urlretrieve(url, tarball_path) + urllib.request.urlretrieve(url, tarball_path) - unpack_archive(tarball_path, version_path / "tarball") + shutil.unpack_archive(tarball_path, version_path / "tarball") # for local verification of the reported (Enterprise) (.pkg.tar.gz) checksums return downloaded_versions, reported_checksums diff --git a/masterfiles/generate_release_information.py b/masterfiles/generate_release_information.py index 9df5f7e7..cdfa4b4f 100644 --- a/masterfiles/generate_release_information.py +++ b/masterfiles/generate_release_information.py @@ -1,5 +1,5 @@ # TODO 
document `cfbs generate-release-information` -# this command uses several extra deps compared to the rest of cfbs +# it generates the .json data files in the cwd import sys from pathlib import Path @@ -7,14 +7,16 @@ from masterfiles.check_tarball_checksums import check_tarball_checksums from masterfiles.generate_vcf_download import generate_vcf_download from masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout -from masterfiles.check_download_matches_git import check_download_matches_git + +# commented out for now as this adds an extra dependency in its current state (dictdiffer) +# from masterfiles.check_download_matches_git import check_download_matches_git ENTERPRISE_PATH = Path("./enterprise") def generate_release_information(): - # only needs to be done once (although changes could happen afterwards), and silly to do if already have access to hosted files downloaded_versions, reported_checksums = download_all_versions_enterprise() + # TODO Community coverage: # downloaded_versions, reported_checksums = download_all_versions_community() # Enterprise 3.9.2 is downloaded but there is no reported checksum, so both args are necessary @@ -29,5 +31,6 @@ def generate_release_information(): generate_vcf_download(ENTERPRISE_PATH, downloaded_versions) generate_vcf_git_checkout(downloaded_versions) - check_download_matches_git(downloaded_versions) - # TODO automatic analysis of the difference-*.txts + # TODO automatic analysis of the difference between downloadable MPF data and git MPF data + # in its current state, this generates differences-*.txt files for each version + # check_download_matches_git(downloaded_versions) From 2768a80ff7341b1d1e7330180f1f5e66ac771605 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Tue, 22 Oct 2024 19:55:08 +0200 Subject: [PATCH 03/12] Add unit tests and improve documentation Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- 
masterfiles/check_download_matches_git.py | 6 +++- tests/test_utils.py | 35 ++++++++++++++++++----- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/masterfiles/check_download_matches_git.py b/masterfiles/check_download_matches_git.py index 924f040f..d387d23f 100644 --- a/masterfiles/check_download_matches_git.py +++ b/masterfiles/check_download_matches_git.py @@ -1,4 +1,8 @@ -# check that the downloadable files match the git files, mitigating a build system supply-chain attack +"""Check that the downloadable files match the git files. + +This can be used to monitor / detect if something has been changed, accidentally or maliciously. +""" + import os import dictdiffer diff --git a/tests/test_utils.py b/tests/test_utils.py index d883e858..2d2ac517 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,10 +1,11 @@ -from cfbs.utils import canonify, deduplicate_def_json, merge_json, loads_bundlenames - - -def test_canonify(): - assert canonify("Hello CFEngine!") == "Hello_CFEngine_" - assert canonify("/etc/os-release") == "_etc_os_release" - assert canonify("my-example-module") == "my_example_module" +from cfbs.utils import ( + canonify, + deduplicate_def_json, + file_sha256, + merge_json, + loads_bundlenames, + string_sha256, +) def test_merge_json(): @@ -140,6 +141,26 @@ def test_deduplicate_def_json(): assert deduplicated == expected +def test_string_sha256(): + s = "cfbs/masterfiles/" + checksum = "9e63d3266f80328fb6547b3462e81ab55b13f689d6b0944e242e2b3a0f3a32a3" + + assert string_sha256(s) == checksum + + +def test_file_sha256(): + file_path = "tests/sample/foo/main.cf" + checksum = "da90bdfe7b5ee30e4d7871496e8434603315fb1b267660e2d49aee8ef47b246d" + + assert file_sha256(file_path) == checksum + + +def test_canonify(): + assert canonify("Hello CFEngine!") == "Hello_CFEngine_" + assert canonify("/etc/os-release") == "_etc_os_release" + assert canonify("my-example-module") == "my_example_module" + + def test_loads_bundlenames_single_bundle(): 
policy = """bundle agent bogus { From 3f0f1ed42b9971a87d0bf816fe3e9ed416fd7371 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Tue, 22 Oct 2024 20:04:38 +0200 Subject: [PATCH 04/12] Move the `masterfiles` module into the `cfbs` module Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/commands.py | 2 +- {masterfiles => cfbs/masterfiles}/__init__.py | 0 {masterfiles => cfbs/masterfiles}/analyze.py | 0 .../masterfiles}/check_download_matches_git.py | 0 .../masterfiles}/check_tarball_checksums.py | 0 .../masterfiles}/download_all_versions.py | 0 .../masterfiles}/generate_release_information.py | 10 +++++----- .../masterfiles}/generate_vcf_download.py | 2 +- .../masterfiles}/generate_vcf_git_checkout.py | 2 +- 9 files changed, 8 insertions(+), 8 deletions(-) rename {masterfiles => cfbs/masterfiles}/__init__.py (100%) rename {masterfiles => cfbs/masterfiles}/analyze.py (100%) rename {masterfiles => cfbs/masterfiles}/check_download_matches_git.py (100%) rename {masterfiles => cfbs/masterfiles}/check_tarball_checksums.py (100%) rename {masterfiles => cfbs/masterfiles}/download_all_versions.py (100%) rename {masterfiles => cfbs/masterfiles}/generate_release_information.py (74%) rename {masterfiles => cfbs/masterfiles}/generate_vcf_download.py (91%) rename {masterfiles => cfbs/masterfiles}/generate_vcf_git_checkout.py (96%) diff --git a/cfbs/commands.py b/cfbs/commands.py index b709a706..799766c2 100644 --- a/cfbs/commands.py +++ b/cfbs/commands.py @@ -65,7 +65,7 @@ from cfbs.git_magic import Result, commit_after_command, git_commit_maybe_prompt from cfbs.prompts import YES_NO_CHOICES, prompt_user from cfbs.module import Module, is_module_added_manually -from masterfiles.generate_release_information import generate_release_information +from cfbs.masterfiles.generate_release_information import generate_release_information class InputDataUpdateFailed(Exception): diff --git a/masterfiles/__init__.py 
b/cfbs/masterfiles/__init__.py similarity index 100% rename from masterfiles/__init__.py rename to cfbs/masterfiles/__init__.py diff --git a/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py similarity index 100% rename from masterfiles/analyze.py rename to cfbs/masterfiles/analyze.py diff --git a/masterfiles/check_download_matches_git.py b/cfbs/masterfiles/check_download_matches_git.py similarity index 100% rename from masterfiles/check_download_matches_git.py rename to cfbs/masterfiles/check_download_matches_git.py diff --git a/masterfiles/check_tarball_checksums.py b/cfbs/masterfiles/check_tarball_checksums.py similarity index 100% rename from masterfiles/check_tarball_checksums.py rename to cfbs/masterfiles/check_tarball_checksums.py diff --git a/masterfiles/download_all_versions.py b/cfbs/masterfiles/download_all_versions.py similarity index 100% rename from masterfiles/download_all_versions.py rename to cfbs/masterfiles/download_all_versions.py diff --git a/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py similarity index 74% rename from masterfiles/generate_release_information.py rename to cfbs/masterfiles/generate_release_information.py index cdfa4b4f..e2bf8c7d 100644 --- a/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -3,13 +3,13 @@ import sys from pathlib import Path -from masterfiles.download_all_versions import download_all_versions_enterprise -from masterfiles.check_tarball_checksums import check_tarball_checksums -from masterfiles.generate_vcf_download import generate_vcf_download -from masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout +from cfbs.masterfiles.download_all_versions import download_all_versions_enterprise +from cfbs.masterfiles.check_tarball_checksums import check_tarball_checksums +from cfbs.masterfiles.generate_vcf_download import generate_vcf_download +from cfbs.masterfiles.generate_vcf_git_checkout import 
generate_vcf_git_checkout # commented out for now as this adds an extra dependency in its current state (dictdiffer) -# from masterfiles.check_download_matches_git import check_download_matches_git +# from cfbs.masterfiles.check_download_matches_git import check_download_matches_git ENTERPRISE_PATH = Path("./enterprise") diff --git a/masterfiles/generate_vcf_download.py b/cfbs/masterfiles/generate_vcf_download.py similarity index 91% rename from masterfiles/generate_vcf_download.py rename to cfbs/masterfiles/generate_vcf_download.py index 1dd3aa0f..b84f7bf3 100644 --- a/masterfiles/generate_vcf_download.py +++ b/cfbs/masterfiles/generate_vcf_download.py @@ -1,5 +1,5 @@ from cfbs.utils import write_json -from masterfiles.analyze import initialize_vcf, versions_checksums_files +from cfbs.masterfiles.analyze import initialize_vcf, versions_checksums_files def generate_vcf_download(dir_path, downloaded_versions): diff --git a/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py similarity index 96% rename from masterfiles/generate_vcf_git_checkout.py rename to cfbs/masterfiles/generate_vcf_git_checkout.py index 25f398c8..6553c8e8 100644 --- a/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -5,7 +5,7 @@ from cfbs.git import git_exists from cfbs.utils import write_json -from masterfiles.analyze import initialize_vcf, versions_checksums_files +from cfbs.masterfiles.analyze import initialize_vcf, versions_checksums_files DIR_PATH = "." 
"""The path of the working directory.""" From fd85e14355b2080b24044d77fbc08867bc85c41f Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:05:58 +0200 Subject: [PATCH 05/12] Impose sorted order on the VCF data JSONs for more determinism Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/analyze.py | 21 ++++++++++++++++++- cfbs/masterfiles/generate_vcf_download.py | 10 ++++++++- cfbs/masterfiles/generate_vcf_git_checkout.py | 10 ++++++++- cfbs/utils.py | 6 ++++++ 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index 2f09057f..59f1cc72 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -1,7 +1,7 @@ # TODO merge this with ENT-12099 branch cfbs analyze.py import os -from cfbs.utils import file_sha256 +from cfbs.utils import dict_sorted_by_key, file_sha256 IGNORED_PATH_COMPONENTS = [".git/", ".gitignore", ".gitattributes"] # ignore a path iff it contains a component (single file or directory) from this list @@ -52,3 +52,22 @@ def versions_checksums_files( ) return versions_dict, checksums_dict, files_dict + + +def finalize_vcf(versions_dict, checksums_dict, files_dict): + # sort checksums + sorted_checksums_dict = dict_sorted_by_key(checksums_dict["checksums"]) + checksums_dict["checksums"] = sorted_checksums_dict + + # sort files, alphabetically + sorted_files_dict = dict_sorted_by_key(files_dict["files"]) + files_dict["files"] = sorted_files_dict + + # sort files of each version + working_dict = versions_dict["versions"] + for k in working_dict.keys(): + sorted_dict = dict_sorted_by_key(working_dict[k]["files"]) + working_dict[k]["files"] = sorted_dict + versions_dict["versions"] = working_dict + + return versions_dict, checksums_dict, files_dict diff --git a/cfbs/masterfiles/generate_vcf_download.py b/cfbs/masterfiles/generate_vcf_download.py index b84f7bf3..60a96ceb 
100644 --- a/cfbs/masterfiles/generate_vcf_download.py +++ b/cfbs/masterfiles/generate_vcf_download.py @@ -1,5 +1,9 @@ from cfbs.utils import write_json -from cfbs.masterfiles.analyze import initialize_vcf, versions_checksums_files +from cfbs.masterfiles.analyze import ( + finalize_vcf, + initialize_vcf, + versions_checksums_files, +) def generate_vcf_download(dir_path, downloaded_versions): @@ -16,6 +20,10 @@ def generate_vcf_download(dir_path, downloaded_versions): files_dir_path, version, versions_dict, checksums_dict, files_dict ) + versions_dict, checksums_dict, files_dict = finalize_vcf( + versions_dict, checksums_dict, files_dict + ) + write_json("versions.json", versions_dict) write_json("checksums.json", checksums_dict) write_json("files.json", files_dict) diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index 6553c8e8..bdad8d8c 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -5,7 +5,11 @@ from cfbs.git import git_exists from cfbs.utils import write_json -from cfbs.masterfiles.analyze import initialize_vcf, versions_checksums_files +from cfbs.masterfiles.analyze import ( + finalize_vcf, + initialize_vcf, + versions_checksums_files, +) DIR_PATH = "." 
"""The path of the working directory.""" @@ -79,6 +83,10 @@ def generate_vcf_git_checkout(interesting_tags=None): MPF_PATH, tag, versions_dict, checksums_dict, files_dict ) + versions_dict, checksums_dict, files_dict = finalize_vcf( + versions_dict, checksums_dict, files_dict + ) + write_json("versions-git.json", versions_dict) write_json("checksums-git.json", checksums_dict) write_json("files-git.json", files_dict) diff --git a/cfbs/utils.py b/cfbs/utils.py index 83b3e97b..0e9e9a2d 100644 --- a/cfbs/utils.py +++ b/cfbs/utils.py @@ -236,6 +236,12 @@ def deduplicate_list(l): return list(OrderedDict.fromkeys(l)) +def dict_sorted_by_key(the_dict): + sorted_dict = OrderedDict(sorted(the_dict.items())) + + return sorted_dict + + def cfbs_filename() -> str: return "cfbs.json" From b516b19c038d149e10df342e60b71cb777cc34b5 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Thu, 24 Oct 2024 19:37:58 +0200 Subject: [PATCH 06/12] Apply review suggestions Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/analyze.py | 10 ++++++---- cfbs/masterfiles/generate_vcf_git_checkout.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index 59f1cc72..f618afde 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -3,11 +3,13 @@ from cfbs.utils import dict_sorted_by_key, file_sha256 +# TODO implement the ignoring IGNORED_PATH_COMPONENTS = [".git/", ".gitignore", ".gitattributes"] -# ignore a path iff it contains a component (single file or directory) from this list -# an element of this list should be just one component -# folders should end with '/', files should not -# TODO +"""The analysis ignores paths described by this list. A path will be ignored if and only if it contains a component (a single file or directory, anywhere in the path) from this list. 
+ +Each element of this list should specify a singular component. +Folders should end with `/`, and files should not. +""" def initialize_vcf(): diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index bdad8d8c..12519be3 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -3,7 +3,6 @@ import subprocess import sys -from cfbs.git import git_exists from cfbs.utils import write_json from cfbs.masterfiles.analyze import ( finalize_vcf, @@ -18,7 +17,21 @@ MPF_PATH = os.path.join(DIR_PATH, "masterfiles") +def check_required_command(command): + if not shutil.which(command): + print("`%s` was not found" % command) + sys.exit(1) + + +def check_required_commands(commands): + for c in commands: + check_required_command(c) + + def generate_vcf_git_checkout(interesting_tags=None): + required_commands = ["git", "make", "automake", "autoconf"] + check_required_commands(required_commands) + # clone the MPF repo every time the script is run, in case there are updates if os.path.isdir(MPF_PATH): shutil.rmtree(MPF_PATH) @@ -29,10 +42,6 @@ def generate_vcf_git_checkout(interesting_tags=None): check=True, ) - if not git_exists(): - print("`git` was not found") - sys.exit(1) - result = subprocess.run( ["git", "tag"], cwd=MPF_PATH, capture_output=True, check=True ) From e6bc204ce9885f8c080c2f1e773484b7bf938d2f Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Wed, 13 Nov 2024 20:30:17 +0100 Subject: [PATCH 07/12] Apply review suggestions, reduce amount downloaded Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/analyze.py | 2 +- .../masterfiles/check_download_matches_git.py | 4 +- cfbs/masterfiles/check_tarball_checksums.py | 18 +-- cfbs/masterfiles/download_all_versions.py | 135 ++++++++++-------- .../generate_release_information.py | 14 +- cfbs/masterfiles/generate_vcf_download.py | 4 +- 
cfbs/masterfiles/generate_vcf_git_checkout.py | 45 +++--- 7 files changed, 121 insertions(+), 101 deletions(-) diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index f618afde..efb5d8c1 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -23,7 +23,7 @@ def initialize_vcf(): def versions_checksums_files( files_dir_path, version, versions_dict, checksums_dict, files_dict ): - for root, dirs, files in os.walk(files_dir_path): + for root, _, files in os.walk(files_dir_path): for name in files: full_relpath = os.path.join(root, name) tarball_relpath = os.path.relpath(full_relpath, files_dir_path) diff --git a/cfbs/masterfiles/check_download_matches_git.py b/cfbs/masterfiles/check_download_matches_git.py index d387d23f..e0749083 100644 --- a/cfbs/masterfiles/check_download_matches_git.py +++ b/cfbs/masterfiles/check_download_matches_git.py @@ -19,6 +19,6 @@ def check_download_matches_git(versions): download_version_dict = download_versions_dict["versions"][version]["files"] git_version_dict = git_versions_dict["versions"][version]["files"] - for diff in list(dictdiffer.diff(download_version_dict, git_version_dict)): - with open("differences/difference-" + version + ".txt", "w") as f: + with open("differences/difference-" + version + ".txt", "w") as f: + for diff in dictdiffer.diff(download_version_dict, git_version_dict): print(diff, file=f) diff --git a/cfbs/masterfiles/check_tarball_checksums.py b/cfbs/masterfiles/check_tarball_checksums.py index d7e615f6..a4b647fc 100644 --- a/cfbs/masterfiles/check_tarball_checksums.py +++ b/cfbs/masterfiles/check_tarball_checksums.py @@ -1,27 +1,29 @@ +import os + from cfbs.utils import file_sha256, immediate_files def check_tarball_checksums(dir_path, downloaded_versions, reported_checksums): does_match = True + print("Verifying checksums...") + for version in downloaded_versions: - print(version) + if version in ("3.10.0", "3.9.2"): + # 3.10.0 lists a .tar.gz, not a .pkg.tar.gz + # 
3.9.2 lists no masterfiles + continue - version_path = dir_path / version + version_path = os.path.join(dir_path, version) versions_files = immediate_files(version_path) # the tarball should be the only file in the version's directory tarball_name = versions_files[0] - tarball_path = version_path / tarball_name + tarball_path = os.path.join(version_path, tarball_name) tarball_checksum = file_sha256(tarball_path) - if version in ("3.10.0", "3.9.2"): - # 3.10.0 lists a .tar.gz, not a .pkg.tar.gz - # 3.9.2 lists no masterfiles - continue - reported_checksum = reported_checksums[version] if tarball_checksum != reported_checksum: diff --git a/cfbs/masterfiles/download_all_versions.py b/cfbs/masterfiles/download_all_versions.py index 58b9f0cc..f0c3c64c 100644 --- a/cfbs/masterfiles/download_all_versions.py +++ b/cfbs/masterfiles/download_all_versions.py @@ -1,86 +1,99 @@ -from pathlib import Path +import os import shutil -import urllib.request -from cfbs.utils import get_json - -DOWNLOAD = True +from cfbs.utils import fetch_url, get_json, mkdir ENTERPRISE_URL = "https://cfengine.com/release-data/enterprise/releases.json" COMMUNITY_URL = "https://cfengine.com/release-data/community/releases.json" -# TODO -# def download_all_versions_community(): -# data = get_json(COMMUNITY_URL) -# # "masterfiles is at a different index" in 3.10.1 happens only for Enterprise, not Community - +ENTERPRISE_DOWNLOAD_PATH = "enterprise" -def download_all_versions_enterprise(): - data = get_json(ENTERPRISE_URL) - urls_dict = {} +def get_download_urls_enterprise(): + download_urls = {} reported_checksums = {} - for releases_data in data["releases"]: - version = releases_data["version"] - release_url = releases_data["URL"] + data = get_json(ENTERPRISE_URL) - subdata = get_json(release_url) + for release_data in data["releases"]: + version = release_data["version"] + if version == "3.10.0": + # for 3.10.0, for some reason, the masterfiles download link points to the .tar.gz tarball, rather than 
the .pkg.tar.gz tarball + # download the .pkg.tar.gz from an unlisted analoguous URL instead + download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.10.0.pkg.tar.gz" + download_urls[version] = download_url + continue + if version == "3.9.2": + # for 3.9.2, no masterfiles are listed, but an unlisted analoguous URL exists + download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.9.2.pkg.tar.gz" + download_urls[version] = download_url + continue + + release_url = release_data["URL"] + subdata = get_json(release_url) artifacts_data = subdata["artifacts"] + if "Additional Assets" not in artifacts_data: # happens for 3.9.0b1, 3.8.0b1, 3.6.1, 3.6.0 - download_url = None + continue - else: - # for 3.10.0, for some reason, the masterfiles download link points to the .tar.gz tarball, rather than the .pkg.tar.gz tarball - # here, download the .pkg.tar.gz from a hidden analoguous URL instead - if version == "3.10.0": - download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.10.0.pkg.tar.gz" - else: - # there's precisely one version (3.10.1) for which masterfiles is at a different index - if version == "3.10.1": - artifacts_data = artifacts_data["Additional Assets"][1] - else: - artifacts_data = artifacts_data["Additional Assets"][0] - - if artifacts_data["Title"] != "Masterfiles ready-to-install tarball": - # happens for 3.10.1, 3.9.2, 3.9.0, 3.8.2, 3.8.1, 3.8.0, 3.6.2--3.7.4 - # 3.10.1: see above - # 3.9.2: no masterfiles listed, but an analogous hidden URL exists - # 3.9.0 and others: no masterfiles listed, and analogous hidden URLs seemingly do not exist - if version == "3.9.2": - download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.9.2.pkg.tar.gz" - else: - download_url = None - else: - download_url = artifacts_data["URL"] - reported_checksums[version] = artifacts_data["SHA256"] - - if download_url is not None: 
- urls_dict[version] = download_url + assets_data = artifacts_data["Additional Assets"] + masterfiles_data = None + + for asset in assets_data: + if asset["Title"] == "Masterfiles ready-to-install tarball": + masterfiles_data = asset + + if masterfiles_data is None: + # happens for 3.9.2, 3.9.0, 3.8.2, 3.8.1, 3.8.0, 3.7.4--3.6.2 + # 3.9.2: see above + # 3.9.0 and below: no masterfiles listed, and analogous unlisted URLs seemingly do not exist + continue + download_urls[version] = masterfiles_data["URL"] + reported_checksums[version] = masterfiles_data["SHA256"] + + return download_urls, reported_checksums + + +def download_versions_from_urls(output_path, download_urls): downloaded_versions = [] - if DOWNLOAD: - root_path = Path("./enterprise") - Path.mkdir(root_path, exist_ok=True) - for version, url in urls_dict.items(): - # ignore master and .x versions - if url.startswith("http://buildcache"): - continue + mkdir(output_path) + + for version, url in download_urls.items(): + # ignore master and .x versions + if url.startswith("http://buildcache"): + continue + + print("Downloading from", url) + downloaded_versions.append(version) + + version_path = os.path.join(output_path, version) + mkdir(version_path) + + filename = url.split("/")[-1] + tarball_path = os.path.join(version_path, filename) + fetch_url(url, tarball_path) - downloaded_versions.append(version) - print(url) + tarball_dir_path = os.path.join(version_path, "tarball") + shutil.unpack_archive(tarball_path, tarball_dir_path) - version_path = root_path / version - Path.mkdir(version_path, exist_ok=True) + return output_path, downloaded_versions - filename = url.split("/")[-1] - tarball_path = version_path / filename - urllib.request.urlretrieve(url, tarball_path) - shutil.unpack_archive(tarball_path, version_path / "tarball") +# TODO +# def download_all_versions_community(): +# data = get_json(COMMUNITY_URL) + + +def download_all_versions_enterprise(): + download_urls, reported_checksums = 
get_download_urls_enterprise() + + output_path, downloaded_versions = download_versions_from_urls( + ENTERPRISE_DOWNLOAD_PATH, download_urls + ) # for local verification of the reported (Enterprise) (.pkg.tar.gz) checksums - return downloaded_versions, reported_checksums + return output_path, downloaded_versions, reported_checksums diff --git a/cfbs/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py index e2bf8c7d..c89016fc 100644 --- a/cfbs/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -1,7 +1,6 @@ # TODO document `cfbs generate-release-information` # it generates the .json data files in the cwd import sys -from pathlib import Path from cfbs.masterfiles.download_all_versions import download_all_versions_enterprise from cfbs.masterfiles.check_tarball_checksums import check_tarball_checksums @@ -11,24 +10,23 @@ # commented out for now as this adds an extra dependency in its current state (dictdiffer) # from cfbs.masterfiles.check_download_matches_git import check_download_matches_git -ENTERPRISE_PATH = Path("./enterprise") - def generate_release_information(): - downloaded_versions, reported_checksums = download_all_versions_enterprise() + print("Downloading Enterprise masterfiles...") + output_path, downloaded_versions, reported_checksums = ( + download_all_versions_enterprise() + ) # TODO Community coverage: # downloaded_versions, reported_checksums = download_all_versions_community() # Enterprise 3.9.2 is downloaded but there is no reported checksum, so both args are necessary - if check_tarball_checksums( - ENTERPRISE_PATH, downloaded_versions, reported_checksums - ): + if check_tarball_checksums(output_path, downloaded_versions, reported_checksums): print("Every checksum matches") else: print("Checksums differ!") sys.exit(1) - generate_vcf_download(ENTERPRISE_PATH, downloaded_versions) + generate_vcf_download(output_path, downloaded_versions) 
generate_vcf_git_checkout(downloaded_versions) # TODO automatic analysis of the difference between downloadable MPF data and git MPF data diff --git a/cfbs/masterfiles/generate_vcf_download.py b/cfbs/masterfiles/generate_vcf_download.py index 60a96ceb..0e758369 100644 --- a/cfbs/masterfiles/generate_vcf_download.py +++ b/cfbs/masterfiles/generate_vcf_download.py @@ -1,3 +1,5 @@ +import os + from cfbs.utils import write_json from cfbs.masterfiles.analyze import ( finalize_vcf, @@ -14,7 +16,7 @@ def generate_vcf_download(dir_path, downloaded_versions): versions_dict, checksums_dict, files_dict = initialize_vcf() for version in downloaded_versions: - files_dir_path = dir_path / version / "tarball" + files_dir_path = os.path.join(dir_path, version, "tarball") versions_dict, checksums_dict, files_dict = versions_checksums_files( files_dir_path, version, versions_dict, checksums_dict, files_dict diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index 12519be3..84f20761 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -32,15 +32,19 @@ def generate_vcf_git_checkout(interesting_tags=None): required_commands = ["git", "make", "automake", "autoconf"] check_required_commands(required_commands) - # clone the MPF repo every time the script is run, in case there are updates - if os.path.isdir(MPF_PATH): - shutil.rmtree(MPF_PATH) - - subprocess.run( - ["git", "clone", MPF_URL], - cwd=DIR_PATH, - check=True, - ) + # get the current version of the MPF repo + if not os.path.isdir(MPF_PATH): + subprocess.run( + ["git", "clone", "--no-checkout", MPF_URL], + cwd=DIR_PATH, + check=True, + ) + else: + subprocess.run( + ["git", "fetch", "--all"], + cwd=DIR_PATH, + check=True, + ) result = subprocess.run( ["git", "tag"], cwd=MPF_PATH, capture_output=True, check=True @@ -58,19 +62,11 @@ def generate_vcf_git_checkout(interesting_tags=None): versions_dict, checksums_dict, 
files_dict = initialize_vcf() for tag in interesting_tags: - print(tag) + print("Checkouting tag", tag) - # checkout the version tag - subprocess.run( - ["git", "checkout", "--force", tag], - cwd=MPF_PATH, - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - # a clean is necessary to remove all the undesired files + # checkout the version subprocess.run( - ["git", "clean", "-dffx"], + ["git", "checkout", tag], cwd=MPF_PATH, check=True, stdout=subprocess.DEVNULL, @@ -92,6 +88,15 @@ def generate_vcf_git_checkout(interesting_tags=None): MPF_PATH, tag, versions_dict, checksums_dict, files_dict ) + # clean the files to prevent spillage to other versions + subprocess.run( + ["git", "clean", "-dffx"], + cwd=MPF_PATH, + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + versions_dict, checksums_dict, files_dict = finalize_vcf( versions_dict, checksums_dict, files_dict ) From 4c02fbbf3f4b7bfedd56d19efde4977f344fd58d Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Thu, 14 Nov 2024 20:41:11 +0100 Subject: [PATCH 08/12] Add checksums for unlisted URLs, verify checksums earlier Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/check_tarball_checksums.py | 36 ------------------- cfbs/masterfiles/download_all_versions.py | 28 ++++++++++----- .../generate_release_information.py | 14 +++----- 3 files changed, 23 insertions(+), 55 deletions(-) delete mode 100644 cfbs/masterfiles/check_tarball_checksums.py diff --git a/cfbs/masterfiles/check_tarball_checksums.py b/cfbs/masterfiles/check_tarball_checksums.py deleted file mode 100644 index a4b647fc..00000000 --- a/cfbs/masterfiles/check_tarball_checksums.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from cfbs.utils import file_sha256, immediate_files - - -def check_tarball_checksums(dir_path, downloaded_versions, reported_checksums): - does_match = True - - print("Verifying checksums...") - - for 
version in downloaded_versions: - if version in ("3.10.0", "3.9.2"): - # 3.10.0 lists a .tar.gz, not a .pkg.tar.gz - # 3.9.2 lists no masterfiles - continue - - version_path = os.path.join(dir_path, version) - - versions_files = immediate_files(version_path) - # the tarball should be the only file in the version's directory - tarball_name = versions_files[0] - - tarball_path = os.path.join(version_path, tarball_name) - - tarball_checksum = file_sha256(tarball_path) - - reported_checksum = reported_checksums[version] - - if tarball_checksum != reported_checksum: - does_match = False - print("* checksum difference:") - print(version) - print(tarball_checksum) - print(reported_checksum) - - return does_match diff --git a/cfbs/masterfiles/download_all_versions.py b/cfbs/masterfiles/download_all_versions.py index f0c3c64c..651637ca 100644 --- a/cfbs/masterfiles/download_all_versions.py +++ b/cfbs/masterfiles/download_all_versions.py @@ -1,7 +1,7 @@ import os import shutil -from cfbs.utils import fetch_url, get_json, mkdir +from cfbs.utils import FetchError, fetch_url, get_json, mkdir, user_error ENTERPRISE_URL = "https://cfengine.com/release-data/enterprise/releases.json" COMMUNITY_URL = "https://cfengine.com/release-data/community/releases.json" @@ -13,21 +13,27 @@ def get_download_urls_enterprise(): download_urls = {} reported_checksums = {} + print("* gathering download URLs...") + data = get_json(ENTERPRISE_URL) for release_data in data["releases"]: version = release_data["version"] if version == "3.10.0": - # for 3.10.0, for some reason, the masterfiles download link points to the .tar.gz tarball, rather than the .pkg.tar.gz tarball - # download the .pkg.tar.gz from an unlisted analoguous URL instead + # for 3.10.0, for some reason, the "Masterfiles ready-to-install tarball" is a .tar.gz tarball, rather than a .pkg.tar.gz tarball + # download the .pkg.tar.gz tarball from an unlisted analoguous URL instead download_url = 
"https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.10.0.pkg.tar.gz" + digest = "7b5e237529e11ce4ae295922dad1a681f13b95f3a7d247d39d3f5088f1a1d7d3" download_urls[version] = download_url + reported_checksums[version] = digest continue if version == "3.9.2": # for 3.9.2, no masterfiles are listed, but an unlisted analoguous URL exists download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.9.2.pkg.tar.gz" + digest = "ae1a758530d4a4aad5b6812b61fc37ad1b5900b755f88a1ab98da7fd05a9f5cc" download_urls[version] = download_url + reported_checksums[version] = digest continue release_url = release_data["URL"] @@ -57,7 +63,7 @@ def get_download_urls_enterprise(): return download_urls, reported_checksums -def download_versions_from_urls(output_path, download_urls): +def download_versions_from_urls(output_path, download_urls, reported_checksums): downloaded_versions = [] mkdir(output_path) @@ -67,15 +73,20 @@ def download_versions_from_urls(output_path, download_urls): if url.startswith("http://buildcache"): continue - print("Downloading from", url) + print("* downloading from", url) downloaded_versions.append(version) version_path = os.path.join(output_path, version) mkdir(version_path) + # download a version, and verify the reported checksum matches filename = url.split("/")[-1] tarball_path = os.path.join(version_path, filename) - fetch_url(url, tarball_path) + checksum = reported_checksums[version] + try: + fetch_url(url, tarball_path, checksum) + except FetchError as e: + user_error("For version " + version + ": " + str(e)) tarball_dir_path = os.path.join(version_path, "tarball") shutil.unpack_archive(tarball_path, tarball_dir_path) @@ -92,8 +103,7 @@ def download_all_versions_enterprise(): download_urls, reported_checksums = get_download_urls_enterprise() output_path, downloaded_versions = download_versions_from_urls( - ENTERPRISE_DOWNLOAD_PATH, download_urls + ENTERPRISE_DOWNLOAD_PATH, download_urls, 
reported_checksums ) - # for local verification of the reported (Enterprise) (.pkg.tar.gz) checksums - return output_path, downloaded_versions, reported_checksums + return output_path, downloaded_versions diff --git a/cfbs/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py index c89016fc..35df2284 100644 --- a/cfbs/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -3,7 +3,6 @@ import sys from cfbs.masterfiles.download_all_versions import download_all_versions_enterprise -from cfbs.masterfiles.check_tarball_checksums import check_tarball_checksums from cfbs.masterfiles.generate_vcf_download import generate_vcf_download from cfbs.masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout @@ -13,18 +12,13 @@ def generate_release_information(): print("Downloading Enterprise masterfiles...") - output_path, downloaded_versions, reported_checksums = ( - download_all_versions_enterprise() - ) + + output_path, downloaded_versions = download_all_versions_enterprise() # TODO Community coverage: # downloaded_versions, reported_checksums = download_all_versions_community() - # Enterprise 3.9.2 is downloaded but there is no reported checksum, so both args are necessary - if check_tarball_checksums(output_path, downloaded_versions, reported_checksums): - print("Every checksum matches") - else: - print("Checksums differ!") - sys.exit(1) + print("Download finished. 
Every reported checksum matches.") + print("Generating release information...") generate_vcf_download(output_path, downloaded_versions) generate_vcf_git_checkout(downloaded_versions) From 301909b60847b141815ecd97d6d17d8a352f19a7 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Fri, 15 Nov 2024 19:10:56 +0100 Subject: [PATCH 09/12] Fix bug in git fetch, add more masterfiles versions, improve code quality Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/analyze.py | 1 - cfbs/masterfiles/download_all_versions.py | 44 +++++++++++-------- .../generate_release_information.py | 14 ++---- cfbs/masterfiles/generate_vcf_git_checkout.py | 21 ++------- 4 files changed, 34 insertions(+), 46 deletions(-) diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index efb5d8c1..de9f3b44 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -1,4 +1,3 @@ -# TODO merge this with ENT-12099 branch cfbs analyze.py import os from cfbs.utils import dict_sorted_by_key, file_sha256 diff --git a/cfbs/masterfiles/download_all_versions.py b/cfbs/masterfiles/download_all_versions.py index 651637ca..99ce4d4a 100644 --- a/cfbs/masterfiles/download_all_versions.py +++ b/cfbs/masterfiles/download_all_versions.py @@ -3,10 +3,9 @@ from cfbs.utils import FetchError, fetch_url, get_json, mkdir, user_error -ENTERPRISE_URL = "https://cfengine.com/release-data/enterprise/releases.json" -COMMUNITY_URL = "https://cfengine.com/release-data/community/releases.json" +ENTERPRISE_RELEASES_URL = "https://cfengine.com/release-data/enterprise/releases.json" -ENTERPRISE_DOWNLOAD_PATH = "enterprise" +DOWNLOAD_PATH = "downloaded_masterfiles" def get_download_urls_enterprise(): @@ -15,7 +14,7 @@ def get_download_urls_enterprise(): print("* gathering download URLs...") - data = get_json(ENTERPRISE_URL) + data = get_json(ENTERPRISE_RELEASES_URL) for release_data in data["releases"]: version = 
release_data["version"] @@ -54,7 +53,7 @@ def get_download_urls_enterprise(): if masterfiles_data is None: # happens for 3.9.2, 3.9.0, 3.8.2, 3.8.1, 3.8.0, 3.7.4--3.6.2 # 3.9.2: see above - # 3.9.0 and below: no masterfiles listed, and analogous unlisted URLs seemingly do not exist + # 3.9.0 and below: no masterfiles listed, and unlisted analogous URLs seemingly do not exist continue download_urls[version] = masterfiles_data["URL"] @@ -63,10 +62,10 @@ def get_download_urls_enterprise(): return download_urls, reported_checksums -def download_versions_from_urls(output_path, download_urls, reported_checksums): +def download_versions_from_urls(download_path, download_urls, reported_checksums): downloaded_versions = [] - mkdir(output_path) + mkdir(download_path) for version, url in download_urls.items(): # ignore master and .x versions @@ -76,7 +75,7 @@ def download_versions_from_urls(output_path, download_urls, reported_checksums): print("* downloading from", url) downloaded_versions.append(version) - version_path = os.path.join(output_path, version) + version_path = os.path.join(download_path, version) mkdir(version_path) # download a version, and verify the reported checksum matches @@ -91,19 +90,28 @@ def download_versions_from_urls(output_path, download_urls, reported_checksums): tarball_dir_path = os.path.join(version_path, "tarball") shutil.unpack_archive(tarball_path, tarball_dir_path) - return output_path, downloaded_versions + return downloaded_versions -# TODO -# def download_all_versions_community(): -# data = get_json(COMMUNITY_URL) - - -def download_all_versions_enterprise(): +def download_all_versions(): download_urls, reported_checksums = get_download_urls_enterprise() - output_path, downloaded_versions = download_versions_from_urls( - ENTERPRISE_DOWNLOAD_PATH, download_urls, reported_checksums + # add masterfiles versions which do not appear in Enterprise releases but appear in Community releases + # 3.12.0b1 + version = "3.12.0b1" + download_url = 
"https://cfengine-package-repos.s3.amazonaws.com/community_binaries/Community-3.12.0b1/misc/cfengine-masterfiles-3.12.0b1.pkg.tar.gz" + digest = "ede305dae7be3edfac04fc5b7f63b46adb3a5b1612f4755e855ee8e6b8d344d7" + download_urls[version] = download_url + reported_checksums[version] = digest + # 3.10.0b1 + version = "3.10.0b1" + download_url = "https://cfengine-package-repos.s3.amazonaws.com/tarballs/cfengine-masterfiles-3.10.0b1.pkg.tar.gz" + digest = "09291617254705d79dea2531b23dbd0754f09029e90ce0b43b275aa02c1223a3" + download_urls[version] = download_url + reported_checksums[version] = digest + + downloaded_versions = download_versions_from_urls( + DOWNLOAD_PATH, download_urls, reported_checksums ) - return output_path, downloaded_versions + return DOWNLOAD_PATH, downloaded_versions diff --git a/cfbs/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py index 35df2284..785d9fe6 100644 --- a/cfbs/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -1,8 +1,4 @@ -# TODO document `cfbs generate-release-information` -# it generates the .json data files in the cwd -import sys - -from cfbs.masterfiles.download_all_versions import download_all_versions_enterprise +from cfbs.masterfiles.download_all_versions import download_all_versions from cfbs.masterfiles.generate_vcf_download import generate_vcf_download from cfbs.masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout @@ -11,16 +7,14 @@ def generate_release_information(): - print("Downloading Enterprise masterfiles...") + print("Downloading masterfiles...") - output_path, downloaded_versions = download_all_versions_enterprise() - # TODO Community coverage: - # downloaded_versions, reported_checksums = download_all_versions_community() + download_path, downloaded_versions = download_all_versions() print("Download finished. 
Every reported checksum matches.") print("Generating release information...") - generate_vcf_download(output_path, downloaded_versions) + generate_vcf_download(download_path, downloaded_versions) generate_vcf_git_checkout(downloaded_versions) # TODO automatic analysis of the difference between downloadable MPF data and git MPF data diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index 84f20761..a6b007cb 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -28,7 +28,7 @@ def check_required_commands(commands): check_required_command(c) -def generate_vcf_git_checkout(interesting_tags=None): +def generate_vcf_git_checkout(checkout_tags): required_commands = ["git", "make", "automake", "autoconf"] check_required_commands(required_commands) @@ -42,26 +42,13 @@ def generate_vcf_git_checkout(interesting_tags=None): else: subprocess.run( ["git", "fetch", "--all"], - cwd=DIR_PATH, + cwd=MPF_PATH, check=True, ) - result = subprocess.run( - ["git", "tag"], cwd=MPF_PATH, capture_output=True, check=True - ) - tags = result.stdout.decode("UTF-8").splitlines() - - # if not given, choose tags to checkout - by default, only consider version releases - if interesting_tags is None: - interesting_tags = [] - - for tag in tags: - if "-" not in tag: - interesting_tags.append(tag) - versions_dict, checksums_dict, files_dict = initialize_vcf() - for tag in interesting_tags: + for tag in checkout_tags: print("Checkouting tag", tag) # checkout the version @@ -90,7 +77,7 @@ def generate_vcf_git_checkout(interesting_tags=None): # clean the files to prevent spillage to other versions subprocess.run( - ["git", "clean", "-dffx"], + ["git", "clean", "-dfx"], cwd=MPF_PATH, check=True, stdout=subprocess.DEVNULL, From 97b0c30ad6ee4804ceba1f547d7a520172e0e6fc Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Wed, 20 Nov 2024 18:13:39 +0100 
Subject: [PATCH 10/12] Implement verification of downloadable files matching git files without additional dependencies Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/masterfiles/analyze.py | 8 ---- .../masterfiles/check_download_matches_git.py | 40 ++++++++++++++----- .../generate_release_information.py | 14 ++++--- cfbs/masterfiles/generate_vcf_git_checkout.py | 24 ++++++++++- cfbs/utils.py | 16 ++++++++ 5 files changed, 77 insertions(+), 25 deletions(-) diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index de9f3b44..5abe9be5 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -2,14 +2,6 @@ from cfbs.utils import dict_sorted_by_key, file_sha256 -# TODO implement the ignoring -IGNORED_PATH_COMPONENTS = [".git/", ".gitignore", ".gitattributes"] -"""The analysis ignores paths described by this list. A path will be ignored if and only if it contains a component (a single file or directory, anywhere in the path) from this list. - -Each element of this list should specify a singular component. -Folders should end with `/`, and files should not. -""" - def initialize_vcf(): versions_dict = {"versions": {}} diff --git a/cfbs/masterfiles/check_download_matches_git.py b/cfbs/masterfiles/check_download_matches_git.py index e0749083..a90cc87c 100644 --- a/cfbs/masterfiles/check_download_matches_git.py +++ b/cfbs/masterfiles/check_download_matches_git.py @@ -1,15 +1,16 @@ -"""Check that the downloadable files match the git files. - -This can be used to monitor / detect if something has been changed, accidentally or maliciously. -""" - import os -import dictdiffer -from cfbs.utils import read_json +from cfbs.utils import dict_diff, read_json, user_error def check_download_matches_git(versions): + """Check that the downloadable files match the git files. + + This can be used to monitor / detect if something has been changed, accidentally or maliciously. 
+ + Generates a `differences/difference-*.txt` file for each version. + """ + download_versions_dict = read_json("versions.json") git_versions_dict = read_json("versions-git.json") @@ -19,6 +20,27 @@ def check_download_matches_git(versions): download_version_dict = download_versions_dict["versions"][version]["files"] git_version_dict = git_versions_dict["versions"][version]["files"] + # normalize downloaded version dictionary filepaths + # necessary because the downloaded version and git version dictionaries have filepaths of different forms + new_download_dict = {} + for key, value in download_version_dict.items(): + if key.startswith("masterfiles/"): + key = key[12:] + new_download_dict[key] = value + download_version_dict = new_download_dict + with open("differences/difference-" + version + ".txt", "w") as f: - for diff in dictdiffer.diff(download_version_dict, git_version_dict): - print(diff, file=f) + only_dl, only_git, value_diff = dict_diff( + download_version_dict, git_version_dict + ) + + print("Files only in the downloaded version:", only_dl, file=f) + print("Files only in the git version:", only_git, file=f) + print("Files with different contents:", value_diff, file=f) + + if len(only_dl) > 0 or len(value_diff) > 0: + user_error( + "Downloadable files of version " + + version + + " do not match git files" + ) diff --git a/cfbs/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py index 785d9fe6..698403b7 100644 --- a/cfbs/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -1,9 +1,7 @@ from cfbs.masterfiles.download_all_versions import download_all_versions from cfbs.masterfiles.generate_vcf_download import generate_vcf_download from cfbs.masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout - -# commented out for now as this adds an extra dependency in its current state (dictdiffer) -# from cfbs.masterfiles.check_download_matches_git import 
check_download_matches_git +from cfbs.masterfiles.check_download_matches_git import check_download_matches_git def generate_release_information(): @@ -17,6 +15,10 @@ def generate_release_information(): generate_vcf_download(download_path, downloaded_versions) generate_vcf_git_checkout(downloaded_versions) - # TODO automatic analysis of the difference between downloadable MPF data and git MPF data - # in its current state, this generates differences-*.txt files for each version - # check_download_matches_git(downloaded_versions) + print("Candidate release information generated.") + print("Checking that downloadable files match git files...") + + check_download_matches_git(downloaded_versions) + + print("Downloadable files match git files.") + print("Release information generation successfully finished.") diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index a6b007cb..eec53ec3 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -51,9 +51,22 @@ def generate_vcf_git_checkout(checkout_tags): for tag in checkout_tags: print("Checkouting tag", tag) + # checkouting some tags equal to the downloaded version doesn't result in the same files + # the downloadable files are reproducible by checkouting specific tags + if tag == "3.18.0": + checkout_tag = "3.18.0-2" + elif tag == "3.15.4": + checkout_tag = "3.15.4-2-build2" + elif tag == "3.12.3": + checkout_tag = "3.12.3-build7" + elif tag == "3.7.7": + checkout_tag = "3.7.7-build1" + else: + checkout_tag = tag + # checkout the version subprocess.run( - ["git", "checkout", tag], + ["git", "checkout", checkout_tag], cwd=MPF_PATH, check=True, stdout=subprocess.DEVNULL, @@ -61,11 +74,18 @@ def generate_vcf_git_checkout(checkout_tags): ) # build masterfiles from git as they are in the tarball packages + # for the files of this version to be reproducible, the `EXPLICIT_RELEASE` environment variable needs to be set 
to what it was when the downloadable files were built + if tag == "3.18.3": + release_number = "2" + else: + release_number = "1" subprocess.run( ["./autogen.sh"], cwd=MPF_PATH, check=True, - env=dict(os.environ.copy(), EXPLICIT_VERSION=tag), + env=dict( + os.environ.copy(), EXPLICIT_VERSION=tag, EXPLICIT_RELEASE=release_number + ), ) # older masterfiles version READMEs instruct to use `make install` and newer `make` - always use `make` instead subprocess.run(["make"], cwd=MPF_PATH, check=True) diff --git a/cfbs/utils.py b/cfbs/utils.py index 0e9e9a2d..b1a1f5af 100644 --- a/cfbs/utils.py +++ b/cfbs/utils.py @@ -242,6 +242,22 @@ def dict_sorted_by_key(the_dict): return sorted_dict +def dict_diff(A, B): + keys_A = set(A.keys()) + keys_B = set(B.keys()) + keys_in_both = keys_A & keys_B + keys_only_A = keys_A - keys_in_both + keys_only_B = keys_B - keys_in_both + + values_different = set((k, A[k], B[k]) for k in keys_in_both if A[k] != B[k]) + + keys_only_A = sorted(keys_only_A) + keys_only_B = sorted(keys_only_B) + values_different = sorted(values_different) + + return keys_only_A, keys_only_B, values_different + + def cfbs_filename() -> str: return "cfbs.json" From 879fac3947fe6310274c5d2737fbaaf9caca9f70 Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:27:40 +0100 Subject: [PATCH 11/12] Add unit test, a docstring, and close opened file Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- cfbs/utils.py | 8 +++++++- tests/test_utils.py | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cfbs/utils.py b/cfbs/utils.py index b1a1f5af..a143c0da 100644 --- a/cfbs/utils.py +++ b/cfbs/utils.py @@ -243,6 +243,11 @@ def dict_sorted_by_key(the_dict): def dict_diff(A, B): + """Returns three sorted lists: + * first: list of keys only in `A` + * second: list of keys only in `B` + * third: list of tuples `(k, A[k], B[k])` for keys `k` in both with differing values + """ keys_A 
= set(A.keys()) keys_B = set(B.keys()) keys_in_both = keys_A & keys_B @@ -315,7 +320,8 @@ def string_sha256(input): def file_sha256(file): h = hashlib.sha256() - h.update(open(file, "rb").read()) + with open(file, "rb") as f: + h.update(f.read()) return h.hexdigest() diff --git a/tests/test_utils.py b/tests/test_utils.py index 2d2ac517..edba41a6 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ from cfbs.utils import ( canonify, deduplicate_def_json, + dict_diff, file_sha256, merge_json, loads_bundlenames, @@ -141,6 +142,13 @@ def test_deduplicate_def_json(): assert deduplicated == expected +def test_dict_diff(): + A = {"A": "a", "B": "b", "C": "c"} + B = {"A": "a", "B": "c", "D": "d"} + + assert dict_diff(A, B) == (["C"], ["D"], [("B", "b", "c")]) + + def test_string_sha256(): s = "cfbs/masterfiles/" checksum = "9e63d3266f80328fb6547b3462e81ab55b13f689d6b0944e242e2b3a0f3a32a3" From 1df21895f7fb55dfff4ba35354cc108e0b64538f Mon Sep 17 00:00:00 2001 From: jakub-nt <175944085+jakub-nt@users.noreply.github.com> Date: Fri, 22 Nov 2024 20:16:15 +0100 Subject: [PATCH 12/12] Explicitly fully sort VCF data for determinism, add argument to omit download, document `cfbs generate-release-information`, change git-checkout wording Signed-off-by: jakub-nt <175944085+jakub-nt@users.noreply.github.com> --- README.md | 1 + cfbs/args.py | 5 ++ cfbs/cfbs.1 | 8 +- cfbs/commands.py | 4 +- cfbs/main.py | 10 ++- cfbs/masterfiles/analyze.py | 77 +++++++++++++++++-- cfbs/masterfiles/download_all_versions.py | 8 +- .../generate_release_information.py | 17 ++-- cfbs/masterfiles/generate_vcf_git_checkout.py | 8 +- 9 files changed, 111 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index d42366cc..71030cff 100644 --- a/README.md +++ b/README.md @@ -287,6 +287,7 @@ These commands are intended to be run as part of build systems / deployment pipe `cfbs set-input` and `cfbs get-input` can be thought of as ways to save and load the input file. 
Similar to `cfbs get-input` the JSON contains both the specification (what the module accepts and how it's presented to the user) as well as the user's responses (if present). Expected usage is to run `cfbs get-input` to get the JSON, and then fill out the response part and run `cfbs set-input`. +* `cfbs generate-release-information`: An internal command used to generate JSON release information files from the [official CFEngine masterfiles](https://github.com/cfengine/masterfiles/). * `cfbs validate`: Used to validate the [index JSON file](https://github.com/cfengine/build-index/blob/master/cfbs.json). May be expanded to validate other files and formats in the future. **Note:** If you use `cfbs validate` as part of your automation, scripts, and build systems, be aware that we might add more strict validation rules in the future, so be prepared to sometimes have it fail after upgrading the version of cfbs. diff --git a/cfbs/args.py b/cfbs/args.py index bfd48960..1e608fae 100644 --- a/cfbs/args.py +++ b/cfbs/args.py @@ -100,6 +100,11 @@ def get_arg_parser(): help="Ignore versions.json. Necessary in case of a custom index or testing changes to the default index.", action="store_true", ) + parser.add_argument( + "--omit-download", + help="Use existing masterfiles instead of downloading in 'cfbs generate-release-information'", + action="store_true", + ) parser.add_argument( "--masterfiles", help="Add masterfiles on cfbs init choose between" ) diff --git a/cfbs/cfbs.1 b/cfbs/cfbs.1 index 049ef06e..1a0bca30 100644 --- a/cfbs/cfbs.1 +++ b/cfbs/cfbs.1 @@ -1,4 +1,4 @@ -.TH CFBS "1" "2024\-06\-07" "cfbs" "CFEngine Build System manual" +.TH CFBS "1" "2024\-11\-22" "cfbs" "CFEngine Build System manual" .SH NAME cfbs \- combines multiple modules into 1 policy set to deploy on your infrastructure. Modules can be custom promise types, JSON files which enable certain functionality, or reusable CFEngine policy. 
The modules you use can be written by the CFEngine team, others in the community, your colleagues, or yourself. .SH SYNOPSIS @@ -9,7 +9,7 @@ CFEngine Build System. .TP \fBcmd\fR -The command to perform (pretty, init, status, search, add, remove, clean, update, validate, download, build, install, help, info, show, input, set\-input, get\-input) +The command to perform (pretty, init, status, search, add, remove, clean, update, validate, download, build, install, help, info, show, input, set\-input, get\-input, generate\-release\-information) .TP \fBargs\fR @@ -72,6 +72,10 @@ Specify git commit message \fB\-\-ignore\-versions\-json\fR Ignore versions.json. Necessary in case of a custom index or testing changes to the default index. +.TP +\fB\-\-omit\-download\fR +Use existing masterfiles instead of downloading in 'cfbs generate-release-information' + .TP \fB\-\-masterfiles\fR \fI\,MASTERFILES\/\fR Add masterfiles on cfbs init choose between diff --git a/cfbs/commands.py b/cfbs/commands.py index 799766c2..324d793d 100644 --- a/cfbs/commands.py +++ b/cfbs/commands.py @@ -1208,5 +1208,5 @@ def get_input_command(name, outfile): @cfbs_command("generate-release-information") -def generate_release_information_command(): - generate_release_information() +def generate_release_information_command(omit_download=False): + generate_release_information(omit_download) diff --git a/cfbs/main.py b/cfbs/main.py index 3d36bdde..83bd1018 100644 --- a/cfbs/main.py +++ b/cfbs/main.py @@ -58,6 +58,12 @@ def main() -> int: % args.command ) + if args.omit_download and args.command != "generate-release-information": + user_error( + "The option --omit-download is only for 'cfbs generate-release-information', not 'cfbs %s'" + % args.command + ) + if args.non_interactive and args.command not in ( "init", "add", @@ -92,7 +98,9 @@ def main() -> int: return commands.info_command(args.args) if args.command == "generate-release-information": - return commands.generate_release_information_command() + 
return commands.generate_release_information_command( + omit_download=args.omit_download + ) if not is_cfbs_repo(): user_error("This is not a cfbs repo, to get started, type: cfbs init") diff --git a/cfbs/masterfiles/analyze.py b/cfbs/masterfiles/analyze.py index 5abe9be5..86867d98 100644 --- a/cfbs/masterfiles/analyze.py +++ b/cfbs/masterfiles/analyze.py @@ -1,3 +1,4 @@ +from collections import OrderedDict import os from cfbs.utils import dict_sorted_by_key, file_sha256 @@ -48,19 +49,79 @@ def versions_checksums_files( def finalize_vcf(versions_dict, checksums_dict, files_dict): + # explicitly sort VCF data to ensure determinism + + # checksums.json: + working_dict = checksums_dict["checksums"] + # sort each list, first by version descending, then by filepath alphabetically + for k in working_dict.keys(): + working_dict[k] = sorted( + working_dict[k], + key=lambda d: ( + version_as_comparable_list_negated(d["version"]), + d["file"], + ), + ) # sort checksums - sorted_checksums_dict = dict_sorted_by_key(checksums_dict["checksums"]) - checksums_dict["checksums"] = sorted_checksums_dict + checksums_dict["checksums"] = dict_sorted_by_key(working_dict) + # files.json: + working_dict = files_dict["files"] + # sort each list, first by version descending, then by checksum + for k in working_dict.keys(): + working_dict[k] = sorted( + working_dict[k], + key=lambda d: ( + version_as_comparable_list_negated(d["version"]), + d["checksum"], + ), + ) # sort files, alphabetically - sorted_files_dict = dict_sorted_by_key(files_dict["files"]) - files_dict["files"] = sorted_files_dict + files_dict["files"] = dict_sorted_by_key(working_dict) - # sort files of each version + # versions.json: working_dict = versions_dict["versions"] + # sort files of each version for k in working_dict.keys(): - sorted_dict = dict_sorted_by_key(working_dict[k]["files"]) - working_dict[k]["files"] = sorted_dict - versions_dict["versions"] = working_dict + working_dict[k]["files"] = 
dict_sorted_by_key(working_dict[k]["files"]) + # sort version numbers, in decreasing order + versions_dict["versions"] = OrderedDict( + sorted( + versions_dict["versions"].items(), + key=lambda p: (version_as_comparable_list(p[0]), p[1]), + reverse=True, + ) + ) return versions_dict, checksums_dict, files_dict + + +def version_as_comparable_list(version: str): + """Also supports versions containing exactly one of `b` or `-`. + + Example of the version ordering: `3.24.0b1 < 3.24.0 < 3.24.0-1`. + + Examples: + * `version_as_comparable_list("3.24.0b1")` is `[[3, 24, 0], [-1, 1]]` + * `version_as_comparable_list("3.24.0-2")` is `[[3, 24, 0], [1, 2]]` + * `version_as_comparable_list("3.24.x")` is `[[3, 24, 99999], [0, 0]]`""" + if "b" not in version: + if "-" not in version: + version += "|0.0" + version = version.replace("x", "99999").replace("-", "|1.").replace("b", "|-1.") + versionpair = version.split("|") + versionlist = [versionpair[0].split("."), versionpair[1].split(".")] + + versionlist[0] = [int(s) for s in versionlist[0]] + versionlist[1] = [int(s) for s in versionlist[1]] + + return versionlist + + +def version_as_comparable_list_negated(version): + vcl = version_as_comparable_list(version) + + vcl[0] = [-x for x in vcl[0]] + vcl[1] = [-x for x in vcl[1]] + + return vcl diff --git a/cfbs/masterfiles/download_all_versions.py b/cfbs/masterfiles/download_all_versions.py index 99ce4d4a..fad92cff 100644 --- a/cfbs/masterfiles/download_all_versions.py +++ b/cfbs/masterfiles/download_all_versions.py @@ -5,8 +5,6 @@ ENTERPRISE_RELEASES_URL = "https://cfengine.com/release-data/enterprise/releases.json" -DOWNLOAD_PATH = "downloaded_masterfiles" - def get_download_urls_enterprise(): download_urls = {} @@ -93,7 +91,7 @@ def download_versions_from_urls(download_path, download_urls, reported_checksums return downloaded_versions -def download_all_versions(): +def download_all_versions(download_path): download_urls, reported_checksums = get_download_urls_enterprise() # add 
masterfiles versions which do not appear in Enterprise releases but appear in Community releases @@ -111,7 +109,7 @@ def download_all_versions(): reported_checksums[version] = digest downloaded_versions = download_versions_from_urls( - DOWNLOAD_PATH, download_urls, reported_checksums + download_path, download_urls, reported_checksums ) - return DOWNLOAD_PATH, downloaded_versions + return downloaded_versions diff --git a/cfbs/masterfiles/generate_release_information.py b/cfbs/masterfiles/generate_release_information.py index 698403b7..d8eff440 100644 --- a/cfbs/masterfiles/generate_release_information.py +++ b/cfbs/masterfiles/generate_release_information.py @@ -2,17 +2,24 @@ from cfbs.masterfiles.generate_vcf_download import generate_vcf_download from cfbs.masterfiles.generate_vcf_git_checkout import generate_vcf_git_checkout from cfbs.masterfiles.check_download_matches_git import check_download_matches_git +from cfbs.utils import immediate_subdirectories +DOWNLOAD_PATH = "downloaded_masterfiles" -def generate_release_information(): - print("Downloading masterfiles...") - download_path, downloaded_versions = download_all_versions() +def generate_release_information(omit_download=False): + if not omit_download: + print("Downloading masterfiles...") + + downloaded_versions = download_all_versions(DOWNLOAD_PATH) + + print("Download finished. Every reported checksum matches.") + else: + downloaded_versions = immediate_subdirectories(DOWNLOAD_PATH) - print("Download finished. 
Every reported checksum matches.") print("Generating release information...") - generate_vcf_download(download_path, downloaded_versions) + generate_vcf_download(DOWNLOAD_PATH, downloaded_versions) generate_vcf_git_checkout(downloaded_versions) print("Candidate release information generated.") diff --git a/cfbs/masterfiles/generate_vcf_git_checkout.py b/cfbs/masterfiles/generate_vcf_git_checkout.py index eec53ec3..95a0e9c4 100644 --- a/cfbs/masterfiles/generate_vcf_git_checkout.py +++ b/cfbs/masterfiles/generate_vcf_git_checkout.py @@ -49,10 +49,10 @@ def generate_vcf_git_checkout(checkout_tags): versions_dict, checksums_dict, files_dict = initialize_vcf() for tag in checkout_tags: - print("Checkouting tag", tag) + print("Checking out tag", tag) - # checkouting some tags equal to the downloaded version doesn't result in the same files - # the downloadable files are reproducible by checkouting specific tags + # checking out some tags equal to the downloaded version doesn't result in the same files + # the downloadable files are reproducible by checking out specific tags if tag == "3.18.0": checkout_tag = "3.18.0-2" elif tag == "3.15.4": @@ -64,7 +64,7 @@ def generate_vcf_git_checkout(checkout_tags): else: checkout_tag = tag - # checkout the version + # check out the version subprocess.run( ["git", "checkout", checkout_tag], cwd=MPF_PATH,