Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 121 additions & 154 deletions web/management/commands/validatelanginfofiles.py
Original file line number Diff line number Diff line change
@@ -1,165 +1,132 @@
import os
import json
from pathlib import Path

from django.core.management.base import BaseCommand, CommandError

from web.models import MetaInfo


class Command(BaseCommand):
help = "Reads all language JSON files to ensure they're constructed correctly"

def handle(self, *args, **options):
error_count = 0
warning_count = 0
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.error_count = 0
self.warning_count = 0
self.thesauruses_path = Path("web/thesauruses")

# Open up thesaurus directory
language_dirs = os.listdir("web/thesauruses/")
for lang_dir in language_dirs:
if lang_dir == "_meta":
continue
if os.path.isfile("web/thesauruses/" + lang_dir):
def handle(self, *args, **options):
for lang_dir in self.thesauruses_path.iterdir():
if not lang_dir.is_dir() or lang_dir.name == "_meta":
continue

# Open up each version directory
versions = os.listdir("web/thesauruses/" + lang_dir)
for version in versions:
# Open up each structures file
structure_files = os.listdir("web/thesauruses/" + lang_dir + "/" + version)
for structure_file in structure_files:
structure = structure_file[:-5]

# Ensure valid lang/version/name
meta_structure_file_path = os.path.join(
"web", "thesauruses", lang_dir, version, structure) + ".json"

# parse file
with open(meta_structure_file_path, 'r') as meta_structure_file:
raw_file_data = meta_structure_file.read()

meta_structure_file_json = json.loads(raw_file_data)

language = meta_structure_file_json["meta"]["language"]
language_version = meta_structure_file_json["meta"]["language_version"]
language_name = meta_structure_file_json["meta"]["language_name"]
relative_path_name = lang_dir + "/" + version + "/" + structure + ".json"

if not language:
print(
f"[Error] `{relative_path_name}` has an empty `language` attribute and needs to be updated")
error_count += 1
elif language == "language_id":
print(
f"[Error] `{relative_path_name}` has the default `language` attribute and needs to be updated")
error_count += 1
elif not language == lang_dir:
print(
f"[Error] `{relative_path_name}` has a `language` attribute that should be `{lang_dir}` and needs to be updated")
error_count += 1

if not language_version:
print(
f"[Error] `{relative_path_name}` has an empty `language_version` attribute and needs to be updated")
error_count += 1
elif language_version == "version.number":
print(
f"[Error] `{relative_path_name}` has the default `language_version` attribute and needs to be updated")
error_count += 1

if not language_name:
print(
f"[Error] `{relative_path_name}` has an empty `language_name` attribute and needs to be updated")
error_count += 1
elif language_name == "Human-Friendly Language Name" or language_name == "Human-Readable Language Name":
print(
f"[Error] `{relative_path_name}` has the default `language_name` attribute and needs to be updated")
error_count += 1

# Ensure categories aren't in file
if "categories" in meta_structure_file_json:
print(
f"[Error] `{relative_path_name}` has a `categories` section in it, which is now deprecated")
error_count += 1

# Ensure name lines are removed
for item in meta_structure_file_json["concepts"]:
structure_item_data = meta_structure_file_json["concepts"][item]

# This generates SO many warnings that I'm commenting it out for now. Consider uncommenting
# when more errors and such have been resolved
# if "name" in structure_item_data:
# print(f"[Warn] `{relative_path_name}`, ID: `{item}` has a `name` line that can be "
# f"removed")
# warning_count += 1

# Ensure there's either code or not-implemented
has_code = "code" in structure_item_data
has_not_implemented = "not-implemented" in structure_item_data
has_not_underscore_implemented = "not_implemented" in structure_item_data
has_comments_plural = "comments" in structure_item_data

# Ensure they use not-implemented (hyphen) not not_implemented (underscore)
if has_not_underscore_implemented:
print(f"[Error] `{relative_path_name}`, ID: `{item}` has not_implemented (underscore) "
f"when it should use not-implemented (hyphen)")
error_count += 1

if has_code and (has_not_implemented or has_not_underscore_implemented):
print(
f"[Error] `{relative_path_name}`, ID: `{item}` should have `code` or "
f"`not-implemented`, not both")
error_count += 1

if not has_code and not has_not_implemented and not has_not_underscore_implemented:
print(
f"[Error] `{relative_path_name}`, ID: `{item}` is missing a needed `code` or "
f"`not-implemented` line")
error_count += 1

# Ensure if not-implemented, there's no code line
if has_not_implemented and structure_item_data["not-implemented"] is True and has_code:
print(f"[Error] `{relative_path_name}`, ID: `{item}` is not implemented, but has a "
f"`code` line that should be removed")
error_count += 1

# Ensure if code, it's not empty and there's no not-implemented
if has_code and not structure_item_data["code"] and not has_not_implemented:
print(f"[Error] `{relative_path_name}`, ID: `{item}` is confusing: `code` is empty "
f"but there's no `not-implemented` either")
error_count += 1

# Ensure it's comment, not comments
if has_comments_plural:
print(f"[Error] `{relative_path_name}`, ID: `{item}` has `comments` (plural) that "
f"should be `comment` (singular) instead")
error_count += 1

# Code can be string or array (maybe warn if string)
# if has_code and isinstance(structure_item_data["code"], str):
# print(f"[Warning] `{relative_path_name}`, ID: `{item}` has a `code` line that's a "
# f"string and could optionally be an array")
# warning_count += 1

# There shouldn't be any other fields
for key in structure_item_data:
if not (key == "code"
or key == "comment"
or key == "comments"
or key == "not-implemented"
or key == "not_implemented"
or key == "name"):
# Why "not_implemented"/"name"/"comments"? Because we check for them above,
# this checks for other exceptions
print(f"[Warning] `{relative_path_name}`, ID: `{item}` has a line `{key}` that's "
f"unknown")
warning_count += 1

if warning_count + error_count > 0:
# if error_count > 0:
if warning_count:
print(str(warning_count) + " warnings found.")
if error_count:
print(str(error_count) + " errors found.")
raise CommandError(str(error_count) + " errors found.")
else:
print("No issues found.")
for version_dir in lang_dir.iterdir():
if not version_dir.is_dir():
continue

for structure_file in version_dir.glob("*.json"):
self.validate_language_file(structure_file)

if self.warning_count > 0:
self.stdout.write(self.style.WARNING(f"{self.warning_count} warnings found."))

if self.error_count > 0:
self.stdout.write(self.style.ERROR(f"{self.error_count} errors found."))
raise CommandError(f"{self.error_count} errors found.")

if self.error_count == 0 and self.warning_count == 0:
self.stdout.write(self.style.SUCCESS("No issues found."))

def report_error(self, message):
self.stderr.write(self.style.ERROR(f"[Error] {message}"))
self.error_count += 1

def report_warning(self, message):
self.stdout.write(self.style.WARNING(f"[Warning] {message}"))
self.warning_count += 1

def validate_language_file(self, file_path):
relative_path = file_path.relative_to(self.thesauruses_path)

try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
except (json.JSONDecodeError, UnicodeDecodeError) as e:
self.report_error(f"Failed to parse `{relative_path}`: {e}")
return

self.check_meta_section(data, relative_path)
self.check_concepts(data, relative_path)

def check_meta_section(self, data, relative_path):
meta = data.get("meta", {})
# relative_path is something like "python/3/data_types.json"
# parts[0] is the language directory name
lang_dir = relative_path.parts[0]

language = meta.get("language")
language_version = meta.get("language_version")
language_name = meta.get("language_name")

if not language:
self.report_error(f"`{relative_path}` has an empty `language` attribute and needs to be updated")
elif language == "language_id":
self.report_error(f"`{relative_path}` has the default `language` attribute and needs to be updated")
elif language != lang_dir:
self.report_error(f"`{relative_path}` has a `language` attribute that should be `{lang_dir}` and needs to be updated")

if not language_version:
self.report_error(f"`{relative_path}` has an empty `language_version` attribute and needs to be updated")
elif language_version == "version.number":
self.report_error(f"`{relative_path}` has the default `language_version` attribute and needs to be updated")

if not language_name:
self.report_error(f"`{relative_path}` has an empty `language_name` attribute and needs to be updated")
elif language_name in ["Human-Friendly Language Name", "Human-Readable Language Name"]:
self.report_error(f"`{relative_path}` has the default `language_name` attribute and needs to be updated")

if "categories" in data:
self.report_error(f"`{relative_path}` has a `categories` section in it, which is now deprecated")

def check_concepts(self, data, relative_path):
concepts = data.get("concepts", {})
for concept_id, item_data in concepts.items():
has_code = "code" in item_data
has_not_implemented = "not-implemented" in item_data
has_not_underscore_implemented = "not_implemented" in item_data
has_comments_plural = "comments" in item_data

if has_not_underscore_implemented:
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` has not_implemented (underscore) "
"when it should use not-implemented (hyphen)"
)

if has_code and (has_not_implemented or has_not_underscore_implemented):
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` should have `code` or `not-implemented`, not both"
)

if not has_code and not has_not_implemented and not has_not_underscore_implemented:
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` is missing a needed `code` or `not-implemented` line"
)

if has_not_implemented and item_data.get("not-implemented") is True and has_code:
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` is not implemented, but has a `code` line that should be removed"
)

if has_code and not item_data.get("code") and not has_not_implemented:
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` is confusing: `code` is empty but there's no `not-implemented` either"
)

if has_comments_plural:
self.report_error(
f"`{relative_path}`, ID: `{concept_id}` has `comments` (plural) that should be `comment` (singular) instead"
)

# Check for unknown keys
allowed_keys = {"code", "comment", "not-implemented", "not_implemented", "name", "comments"}
for key in item_data:
if key not in allowed_keys:
self.report_warning(f"`{relative_path}`, ID: `{concept_id}` has a line `{key}` that's unknown")
Loading
Loading