Add tests for handle_urls and handle_pypi_classifieres and fix the bugs they uncovered

notactuallyfinn · notactuallyfinn · commit 6a17973d6adf · 2025-08-20T11:11:13.000+02:00
diff --git a/src/hermes_toml/harvest.py b/src/hermes_toml/harvest.py
@@ -354,6 +354,9 @@ def handle_pypi_classifieres(cls, classifiers: str | list[str], data):
         if isinstance(classifiers, str):
             classifiers = [classifiers]
 
+        # remove duplicates
+        classifiers = list(set(classifiers))
+
         sorted_classifiers = {
             "schema:targetProduct": [], "schema:audience": [], "schema:license": [],
             "schema:inLanguage": [], "schema:programming Language": [], "schema:about": []
@@ -365,32 +368,33 @@ def handle_pypi_classifieres(cls, classifiers: str | list[str], data):
             classifier = classifier.split(" :: ")
             if len(classifier) < 2:
                 continue
-            if classifier[0] == "Operating System":
-                temp = {"@type": "SoftwareApplication", "name": classifier[-1]}
+            if (classifier[0] == "Operating System" and
+                not (len(classifier) == 2 and classifier[1] == "Microsoft")):
+                temp = {"@type": "schema:SoftwareApplication", "schema:name": classifier[-1]}
                 sorted_classifiers["schema:targetProduct"].append(temp)
             elif classifier[0] == "Intended Audience":
-                temp = {"@type": "Audience", "name": classifier[-1]}
+                temp = {"@type": "schema:Audience", "schema:name": classifier[-1]}
                 sorted_classifiers["schema:audience"].append(temp)
             elif (classifier[0] == "License" and
                   not (classifier[1] == "OSI Approved" and len(classifier) == 2)):
-                temp = {"@type": "CreativeWork", "name": classifier[-1]}
+                temp = {"@type": "schema:CreativeWork", "schema:name": classifier[-1]}
                 sorted_classifiers["schema:license"].append(temp)
             elif classifier[0] == "Natural Language":
                 sorted_classifiers["schema:inLanguage"].append(classifier[-1])
             elif classifier[0] == "Programming Language":
                 if classifier[1] == "Python" and len(classifier) > 2:
-                    if classifier[2].isdecimal():
-                        temp = f"Python {classifier[2]}"
-                    elif classifier[2] == "Free Threading":
+                    if classifier[2] == "Free Threading":
                         temp = "Python Free Threading" \
                                f"{f' {classifier[3]}' if len(classifier) > 3 else ''}"
                     elif classifier[2] == "Implementation":
                         temp = classifier[3] if len(classifier) > 3 else "Python Implementation"
+                    else:
+                        temp = f"Python {classifier[2]}"
                     sorted_classifiers["schema:programming Language"].append(temp)
                 else:
                     sorted_classifiers["schema:programming Language"].append(classifier[-1])
             elif classifier[0] == "Topic":
-                temp = {"@type": "Thing", "name": " ".join(classifier[1:])}
+                temp = {"@type": "schema:Thing", "schema:name": " ".join(classifier[1:])}
                 sorted_classifiers["schema:about"].append(temp)
 
         # add everything to the SoftwareMetadata object
@@ -432,7 +436,7 @@ def handle_urls(cls, urls: dict[str, str], data):
         # iterate over the dictionaries items and add the url to the correct bucket
         # if the key hints it to be the right one
         for name, url in urls.items():
-            if not (isinstance(name, str) and isinstance(url, str)):
+            if (not (isinstance(name, str) and isinstance(url, str))) or url == "":
                 continue
             name = name.lower()
             if name.find("code") != -1 or name.find("repository") != -1:
@@ -450,6 +454,7 @@ def handle_urls(cls, urls: dict[str, str], data):
 
         # add everything to the SoftwareMetadata object
         for key, value in sorted_urls.items():
+            value = list(set(value))
             if len(value) > 1:
                 data[key] = value
             elif len(value) == 1:
diff --git a/test/hermes_toml_test/test_harvest.py b/test/hermes_toml_test/test_harvest.py
@@ -7,64 +7,79 @@
 
 import pytest
 import toml
+from pytest_unordered import unordered
 from hermes_toml.harvest import TomlHarvestPlugin
 
 @pytest.mark.parametrize("in_data, out_data", [
-    ({"givenName": "Tom"}, {"givenName": "Tom"}), ({"a": "b"}, {}),
-    ({"givenName": "Tom","a": "b"}, {"givenName": "Tom"}), ({}, {})
+    ({}, {}), (None, {}), ("", {}), (1, {}), ([], {}), ({1:""}, {}), ({None:""}, {}),
+    ({1:"", None:""}, {}), ({"a":[]}, {}), ({"a":None}, {}), ({"a":1}, {}), ({"a":{}}, {}),
+    ({"a":1, "b":None}, {}), ({"a":""}, {}), ({"a":"b"}, {"relatedLink":"b"}),
+    ({"a":"b", "b":None}, {"relatedLink":"b"}), ({"code": "a"}, {"schema:codeRepository": "a"}),
+    ({"a":"b", "b":"c"}, {"relatedLink":unordered(["b", "c"])}),
+    ({"codeRepository": "a"}, {"schema:codeRepository": "a"}),
+    ({"repository": "a"}, {"schema:codeRepository": "a"}),
+    ({"code": "a", "repository":"b"}, {"schema:codeRepository": unordered(["a", "b"])}),
+    ({"code": "a", "repository":"a"}, {"schema:codeRepository": "a"}),
+    ({"buildInstructions": "a"}, {"buildInstructions": "a"}),
+    ({"IssueTracker": "a"}, {"IssueTracker": "a"}), ({"readme": "a"}, {"readme": "a"}),
+    ({"discussion": "a"}, {"schema:discussionURL": "a"}),
+    ({"readme":"a","code":"b","homepage":"c"},
+     {"readme":"a","schema:codeRepository":"b","relatedLink":"c"}),
+    ({"readme":"a","code":"b","homepage":"c", "mymistake":"c"},
+     {"readme":"a","schema:codeRepository":"b","relatedLink":"c"}),
+    ({"readme":"a","code":"b","homepage":"c", "mypage":"d"},
+     {"readme":"a","schema:codeRepository":"b","relatedLink":unordered(["c", "d"])})
 ])
-def test_remove_forbidden_keys(in_data, out_data):
-    assert TomlHarvestPlugin.remove_forbidden_keys(in_data) == out_data
+def test_handle_urls(in_data, out_data):
+    data = {}
+    TomlHarvestPlugin.handle_urls(in_data, data)
+    assert data == out_data
 
 @pytest.mark.parametrize("in_data, out_data", [
-    ({"givenName": "Tom"}, {"givenName": "Tom"}), ({"a": "b"}, {}),
-    ({"givenName": "Tom","a": "b"}, {"givenName": "Tom"}), ({}, {}),
-    ([{"givenName": "Tom"}], [{"givenName": "Tom"}]),
-    ([{"givenName": "Tom"}, {"a": "b"}], [{"givenName": "Tom"}]),
-    ([{}, {"givenName": "Tom"}, {"a": "b"}], [{"givenName": "Tom"}]),
-    ([{}], []), ([{"b":"c"}], []), ([], [])
+    (1, {}), ({}, {}), ("", {}), ([], {}), ([""], {}), (["", ""], {}), ([1], {}),
+    ("Development Status :: xxx", {}), ("Environment :: xxx", {}), ("Framework :: xxx", {}),
+    ("Intended Audience :: xxx", {"schema:audience": {"@type": "schema:Audience", "schema:name": "xxx"}}),
+    ("License :: xxx", {"schema:license": {"@type": "schema:CreativeWork", "schema:name": "xxx"}}),
+    ("License :: OSI Approved", {}), ("Operating System :: Microsoft", {}),
+    ("License :: OSI Approved :: xxx", {"schema:license": {"@type": "schema:CreativeWork", "schema:name": "xxx"}}),
+    ("Natural Language :: xxx", {"schema:inLanguage": "xxx"}),
+    ("Operating System :: xxx", {"schema:targetProduct": {"@type": "schema:SoftwareApplication", "schema:name": "xxx"}}),
+    ("Operating System :: x :: xxx", {"schema:targetProduct": {"@type": "schema:SoftwareApplication", "schema:name": "xxx"}}),
+    ("Operating System :: x :: x :: xxx", {"schema:targetProduct": {"@type": "schema:SoftwareApplication", "schema:name": "xxx"}}),
+    ("Operating System :: Microsoft :: xxx", {"schema:targetProduct": {"@type": "schema:SoftwareApplication", "schema:name": "xxx"}}),
+    ("Programming Language :: xxx", {"schema:programming Language": "xxx"}),
+    (["Programming Language :: xxx", 1], {"schema:programming Language": "xxx"}),
+    ("Programming Language :: Python :: xxx", {"schema:programming Language": "Python xxx"}),
+    ("Programming Language :: Python :: x :: only", {"schema:programming Language": "Python x"}),
+    ("Programming Language :: Python :: Free Threading :: xxx", {"schema:programming Language": "Python Free Threading xxx"}),
+    ("Programming Language :: Python :: Implementation :: x", {"schema:programming Language": "x"}),
+    ("Topic :: a", {"schema:about": {"@type": "schema:Thing", "schema:name": "a"}}),
+    ("Topic :: a :: b", {"schema:about": {"@type": "schema:Thing", "schema:name": "a b"}}),
+    ("Topic :: a :: b :: c", {"schema:about": {"@type": "schema:Thing", "schema:name": "a b c"}}),
+    ("Topic :: a :: b :: c :: d", {"schema:about": {"@type": "schema:Thing", "schema:name": "a b c d"}}),
+    (["Natural Language :: xxx", "Natural Language :: xxx"], {"schema:inLanguage": "xxx"}),
+    (["Natural Language :: xxx", "Natural Language :: yyy"], {"schema:inLanguage": unordered(["xxx", "yyy"])}),
+    (["Natural Language :: xxx", "Programming Language :: xxx"], {"schema:inLanguage": "xxx", "schema:programming Language": "xxx"}),
+    (["Natural Language :: xxx", "Programming Language :: xxx", "Programming Language :: xxx"],
+     {"schema:inLanguage": "xxx", "schema:programming Language": "xxx"}),
+    (["Natural Language :: xxx", "Programming Language :: xxx", "Programming Language :: yyy"],
+     {"schema:inLanguage": "xxx", "schema:programming Language": unordered(["xxx", "yyy"])}),
+    (["Topic :: a", "Topic :: b"], {"schema:about": unordered([{"@type": "schema:Thing", "schema:name": "a"}, {"@type": "schema:Thing", "schema:name": "b"}])}),
 ])
-def test_handle_person(in_data, out_data):
-    assert TomlHarvestPlugin.handle_person_in_unknown_format(in_data) == out_data
+def test_handle_pypi_classifiers(in_data, out_data):
+    data = {}
+    TomlHarvestPlugin.handle_pypi_classifieres(in_data, data)
+    assert data == out_data
 
-@pytest.mark.parametrize("in_data", [
-    (15), ([{}, (15)]), (None)
-])
-def test_handle_person_with_error(in_data):
-    with pytest.raises(ValueError):
-        TomlHarvestPlugin.handle_person_in_unknown_format(in_data)
+"""
+Test cases:
+valid input:
+{"project": {"keywords": ["A", "B", "C"], "classifiers": ["Development Status :: 1 - Planning", "Environment :: Console"], "readme": {"text": "Test", "content-type": "text/markdown"},
+             "requires-python": ">=3.12", "name": "Test", "authors": [{"name": "Testi", "email": "testi@domain.de"}, {"name": "Test", "email": "test@otherdomain.com"}],
+             "maintainers": [{"name": "Tester", "email": "tester@domain.com"}, {"name": "Testers", "email": "testers@otherdomain.de"}], "dependencies": ["scipy>=1.0", "numpy~=1.5"],
+             "license": "MIT AND (Apache-2.0 OR BSD-2-Clause)", "urls": {"homepage": "mypage.org", "documentation": "mydocumentation.com"}, "version": "1.5.2"
+            }
+}
 
-@pytest.fixture(scope="session")
-def toml_file(tmp_path_factory):
-    fn = tmp_path_factory.mktemp("data") / "test.toml"
-    return fn
 
-@pytest.mark.parametrize("in_data, out_data", [
-    ({}, {}), ({"project": {"name":"a"}}, {"name": "a"}),
-    ({"tool": {"poetry": {"name":"a"}}}, {"name": "a"}),
-    ({"project":{"name":"a"}, "a":{"b":"c"}}, {"name":"a"}),
-    ({"project":{"name":"a", "requires-python":">3.7"}}, {"name":"a", "runtimePlatform":"Python >3.7"}),
-    ({"project":{"authors":{"givenName":"a"}}}, {"author":{"givenName":"a", "@type":"Person"}}),
-    ({"project":{"authors":[{"givenName":"a"}, {"givenName":"a"}]}}, {"author":[{"givenName":"a", "@type":"Person"}, {"givenName":"a", "@type":"Person"}]}),
-    ({"project":{"authors":{"givenName":"a", "a":"b"}}}, {"author":{"givenName":"a", "@type":"Person"}}),
-    ({"project":{"authors":{"a":"b"}}}, {}),
-    ({"project":{"authors":[{"a":"a"}, {"givenName":"a"}]}}, {"author":{"givenName":"a", "@type":"Person"}}),
-    ({"project":{"authors":[{"a":"b"}]}}, {}),
-    ({"project":{"authors":["abc def<abc.def@dlr.com>"]}}, {"author":{"name":"abc def", "email": "abc.def@dlr.com", "@type": "Person"}}),
-    ({"project":{"authors":"abc def<abc.def@dlr.com>"}}, {"author":{"name":"abc def", "email": "abc.def@dlr.com", "@type": "Person"}})
-])
-def test_read_from_toml(in_data, out_data, toml_file):
-    toml.dump(in_data, open(toml_file, "w", encoding="utf8"))
-    assert TomlHarvestPlugin.read_from_toml(str(toml_file)) == out_data
-
-@pytest.mark.parametrize("in_data", [
-    ({"project": {"authors":1}}), ({"tool": {"poetry": {"authors":1}}}),
-    ({"project": {"authors":[1]}}), ({"tool": {"poetry": {"authors":[1]}}}),
-    ({"project": {"name":"a"}, "tool": {"poetry": {"name":"a"}}}),
-    ({"project": {"authors":["as<as>as"]}}),
-    ({"project": {"authors":"as<as>as"}})
-])
-def test_read_from_toml_with_error(in_data, toml_file):
-    toml.dump(in_data, open(toml_file, "w", encoding="utf8"))
-    with pytest.raises(ValueError):
-        TomlHarvestPlugin.read_from_toml(str(toml_file))
+"""