From d08fddeb2713286b208115b8ca601906aa1dd092 Mon Sep 17 00:00:00 2001 From: Aliasghar Jawadwala Date: Mon, 26 Jan 2026 16:57:35 +0530 Subject: [PATCH] Add YAML frontmatter syntax validation for license data --- src/licensedcode/frontmatter.py | 2 +- src/licensedcode/models.py | 4 +-- tests/licensedcode/test_license_models.py | 32 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/licensedcode/frontmatter.py b/src/licensedcode/frontmatter.py index 2e3f6ee9971..05c60d12ed3 100644 --- a/src/licensedcode/frontmatter.py +++ b/src/licensedcode/frontmatter.py @@ -135,7 +135,7 @@ def load_frontmatter(fd, encoding="utf-8", **defaults): text = fd.read() else: - with codecs.open(fd, "r", encoding) as f: + with open(fd, "r", encoding=encoding) as f: text = f.read() text = return_unicode(text, encoding) diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 354d93f52d3..5b84230cd0c 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -498,7 +498,7 @@ def dump(self, licenses_data_dir): content = get_yaml_safe_text(content) output = dumps_frontmatter(content=content, metadata=metadata) license_file = self.license_file(licenses_data_dir=licenses_data_dir) - with open(license_file, 'w') as of: + with open(license_file, 'w', encoding='utf-8') as of: of.write(output) def load(self, license_file, check_consistency=True): @@ -2418,7 +2418,7 @@ def dump(self, rules_data_dir, **kwargs): metadata.update(kwargs) content = self.text output = dumps_frontmatter(content=content, metadata=metadata) - with open(rule_file, 'w') as of: + with open(rule_file, 'w', encoding='utf-8') as of: of.write(output) def load(self, rule_file, with_checks=True): diff --git a/tests/licensedcode/test_license_models.py b/tests/licensedcode/test_license_models.py index 6c47d92a594..b407a194532 100644 --- a/tests/licensedcode/test_license_models.py +++ b/tests/licensedcode/test_license_models.py @@ -660,3 +660,35 @@ def 
test_get_key_phrases_ignores_nested_key_phrase_markup(self):
             raise Exception('Exception should be raised')
         except InvalidRuleRequiredPhrase:
             pass
+
+
+class TestLicenseYamlFrontmatterSyntax(FileBasedTesting):
+    """
+    Validate that all license data files have valid YAML syntax.
+    See: https://github.com/aboutcode-org/scancode-toolkit/issues/3947
+    """
+    test_data_dir = TEST_DATA_DIR
+
+    def test_license_yaml_frontmatter_integrity(self):
+        """
+        Ensure every .LICENSE file in licenses_data_dir is accepted by
+        load_frontmatter, and that at least one such file was checked.
+        """
+        from pathlib import Path
+        from licensedcode.frontmatter import load_frontmatter
+        from licensedcode.models import licenses_data_dir
+
+        license_files = sorted(Path(licenses_data_dir).glob('*.LICENSE'))
+        # An empty glob would let this test pass without checking anything.
+        if not license_files:
+            raise AssertionError(f'No .LICENSE files in {licenses_data_dir}')
+        errors = []
+        for license_file in license_files:
+            try:
+                load_frontmatter(str(license_file))
+            except Exception as e:  # collect every failure, report them together
+                errors.append(f'{license_file.name}: {e}')
+        if errors:
+            shown = '\n'.join(errors[:20])  # cap the report at the first 20 errors
+            more = f'\n... and {len(errors) - 20} more errors' if len(errors) > 20 else ''
+            raise AssertionError(f'Invalid YAML in {len(errors)} license files:\n{shown}{more}')