From 037b15418c98fd1cd61d128f339caeb0ffd50986 Mon Sep 17 00:00:00 2001 From: Bradley Miller Date: Wed, 5 Nov 2025 15:09:21 -0800 Subject: [PATCH 1/3] New - manifest as html instead of xml --- components/rsptx/build_tools/core.py | 117 ++++++++++++++------------- 1 file changed, 60 insertions(+), 57 deletions(-) diff --git a/components/rsptx/build_tools/core.py b/components/rsptx/build_tools/core.py index b54db74b..175a8e79 100644 --- a/components/rsptx/build_tools/core.py +++ b/components/rsptx/build_tools/core.py @@ -36,6 +36,7 @@ from sqlalchemy.sql import text # todo: use our logger +import logging from rsptx.logging import rslogger from runestone.server import get_dburl from rsptx.db.models import Library, LibraryValidator @@ -43,7 +44,7 @@ from rsptx.response_helpers.core import canonical_utcnow import pdb -rslogger.setLevel("WARNING") +rslogger.setLevel(logging.DEBUG) # Local packages # -------------- @@ -328,12 +329,12 @@ def extract_docinfo(tree, string, attr=None, click=click): """ authstr = "" if string == "author": - el = tree.findall(f"./{string}") + el = tree.xpath(f"//{string}") for a in el: - authstr += ET.tostring(a, encoding="unicode", method="text").strip() + ", " + authstr += a.text.strip() + ", " authstr = authstr[:-2] return authstr - el = tree.find(f"./{string}") + el = tree.xpath(f".//{string}")[0] if attr is not None and el is not None: print(f"{el.attrib[attr]=}") return el.attrib[attr].strip() @@ -360,8 +361,10 @@ def update_library( # This is a bit of a hack for now... todo: continue to refactor these to use crud functions eng = create_engine(config.dburl.replace("+asyncpg", "")) if build_system == "PTX": - tree = ET.parse(mpath) - docinfo = tree.find("./library-metadata") + parser = ET.HTMLParser() + tree = ET.parse(mpath, parser) + docinfo_list = tree.xpath("//library-metadata") + docinfo = docinfo_list[0] if docinfo_list else None title = extract_docinfo(docinfo, "title") subtitle = extract_docinfo(docinfo, "subtitle") description = extract_docinfo(docinfo, "blurb") @@ -576,8 +579,10 @@ def _initialize_db_context(engine, sess, course_name, manifest_path): assignment_questions = Table("assignment_questions", meta, autoload_with=engine) # Get the author name from the manifest - tree = ET.parse(manifest_path) - docinfo = tree.find("./library-metadata") + parser = ET.HTMLParser() + tree = ET.parse(manifest_path, parser) + docinfo_list = tree.xpath("//library-metadata") + docinfo = docinfo_list[0] if docinfo_list else None author = extract_docinfo(docinfo, "author") res = sess.execute(book_author.select().where(book_author.c.book == course_name)) book_author_data = res.first() @@ -632,11 +637,12 @@ def _process_chapters(sess, db_context, course_name, manifest_path): """Process all chapters from the manifest.""" rslogger.info("Populating the database with Chapter information") - tree = ET.parse(manifest_path) + parser = ET.HTMLParser() + tree = ET.parse(manifest_path, parser) root = tree.getroot() chap = 0 - for chapter in root.findall("./chapter"): + for chapter in root.xpath("//chapter"): chap += 1 chapid = _process_single_chapter(sess, db_context, chapter, chap, course_name) _process_subchapters(sess, db_context, chapter, chapid, course_name) @@ -646,7 +652,8 @@ def _process_appendices(sess, db_context, course_name, manifest_path): """Process all appendices from the manifest.""" rslogger.info("Populating the database with Appendix information") - tree = ET.parse(manifest_path) + parser = ET.HTMLParser() + tree = ET.parse(manifest_path, parser) root = tree.getroot() for appendix in root.findall("./appendix"): @@ -659,20 +666,19 @@ def _process_appendices(sess, db_context, course_name, manifest_path): def _process_single_chapter(sess, db_context, chapter, chap_num, course_name): """Process a single chapter and return its database ID.""" - cnum = chapter.find("./number").text + cnum = chapter.xpath(".//number")[0].text if not cnum: cnum = "" - rslogger.debug( - f"{chapter.tag} {chapter.find('./id').text} {chapter.find('./title').text}" + rslogger.info( + f"{chapter.tag} {chapter.xpath('.//id')[0].text} {chapter.xpath('.//title')[0].text}" ) - ins = ( db_context["chapters"] .insert() .values( - chapter_name=f"{cnum} {chapter.find('./title').text}", + chapter_name=f"{cnum} {chapter.xpath('.//title')[0].text}", course_id=course_name, - chapter_label=chapter.find("./id").text, + chapter_label=chapter.xpath('//id')[0].text, chapter_num=chap_num, ) ) @@ -684,7 +690,7 @@ def _process_subchapters(sess, db_context, chapter, chapid, course_name): """Process all subchapters for a given chapter.""" subchap = 0 - for subchapter in chapter.findall("./subchapter"): + for subchapter in chapter.xpath(".//subchapter"): # check if this subchapter has a time-limit attribute if "data-time" in subchapter.attrib: _process_single_timed_assignment( @@ -694,7 +700,7 @@ def _process_subchapters(sess, db_context, chapter, chapid, course_name): # look for a subsubchapter with a time-limit attribute # at this point (7/28/2025) the only reason for a subsubchapter # is to have a timed assignment, so we can skip the rest of the - for subsubchapter in subchapter.findall("./subsubchapter"): + for subsubchapter in subchapter.xpath(".//subsubchapter"): if "data-time" in subsubchapter.attrib: _process_single_timed_assignment( sess, @@ -715,22 +721,20 @@ def _process_single_subchapter( sess, db_context, chapter, subchapter, chapid, subchap_num, course_name ): """Process a single subchapter and its contents.""" - scnum = subchapter.find("./number").text + scnum = subchapter.xpath(".//number")[0].text if not scnum: scnum = "" - chap_xmlid = subchapter.find("./id").text - rslogger.debug(f"subchapter {chap_xmlid}") + chap_xmlid = subchapter.xpath(".//id")[0].text + rslogger.info(f"subchapter {chap_xmlid}") if not chap_xmlid: rslogger.error(f"Missing id tag in subchapter {subchapter}") # Build subchapter title - titletext = subchapter.find("./title").text + titletext = subchapter.xpath(".//title")[0].text if not titletext: - rslogger.debug(f"constructing title for subchapter {chap_xmlid}") - titletext = " ".join( - [ET.tostring(y).decode("utf8") for y in subchapter.findall("./title/*")] - ) + rslogger.info(f"constructing title for subchapter {chap_xmlid}") + titletext = " ".join(subchapter.xpath(".//title")[0].itertext()) titletext = scnum + " " + titletext.strip() # Insert subchapter @@ -740,7 +744,7 @@ def _process_single_subchapter( .values( sub_chapter_name=titletext, chapter_id=chapid, - sub_chapter_label=subchapter.find("./id").text, + sub_chapter_label=subchapter.xpath(".//id")[0].text, skipreading="F", sub_chapter_num=subchap_num, ) @@ -853,10 +857,10 @@ def _process_single_timed_assignment( ): """Process a timed assignment subchapter.""" rslogger.info("Processing timed assignment subchapter") - titletext = subchapter.find("./title").text.strip() + titletext = subchapter.xpath(".//title")[0].text.strip() if not titletext: titletext = "Timed Assignment" - timed_id = subchapter.find("./id").text + timed_id = subchapter.xpath(".//id")[0].text time_limit = subchapter.attrib.get("data-time", "0") # no-result, no-feedback, no-pause show_feedback = "F" if subchapter.attrib.get("data-no-feedback", "") else "T" @@ -883,13 +887,12 @@ def _process_single_timed_assignment( # Now search for questions in this subchapter qnum = 0 - for question in subchapter.findall("./question"): + for question in subchapter.xpath(".//question"): qnum += 1 # Extract question content - dbtext = " ".join( - [ET.tostring(y).decode("utf8") for y in question.findall("./htmlsrc/*")] - ) - qlabel = " ".join([y.text for y in question.findall("./label")]) + htmlsrc = question.xpath(".//htmlsrc")[0] + dbtext = "".join(ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc) + qlabel = " ".join(question.xpath(".//label")[0].itertext()) # Get question element and metadata el, idchild, old_ww_id, qtype = _extract_question_metadata(question, dbtext) @@ -900,9 +903,9 @@ def _process_single_timed_assignment( # Build question data if parent is not None: - subchap_label = parent.find("./id").text + subchap_label = parent.xpath(".//id")[0].text else: - subchap_label = subchapter.find("./id").text + subchap_label = subchapter.xpath(".//id")[0].text valudict = dict( base_course=course_name, name=idchild, @@ -912,9 +915,9 @@ def _process_single_timed_assignment( htmlsrc=dbtext, autograde=_determine_autograde(dbtext), from_source="T", - chapter=chapter.find("./id").text, + chapter=chapter.xpath(".//id")[0].text, subchapter=subchap_label, - topic=f"{chapter.find('./id').text}/{subchapter.find('./id').text}", + topic=f"{chapter.xpath('.//id')[0].text}/{subchapter.xpath('.//id')[0].text}", qnumber=qlabel, optional="F", practice="F", @@ -931,7 +934,7 @@ def _process_single_timed_assignment( def _add_page_question(sess, db_context, chapter, subchapter, course_name): """Add a page entry to the questions table for this chapter/subchapter.""" - name = f"{chapter.find('./title').text}/{subchapter.find('./title').text}" + name = f"{chapter.xpath('.//title')[0].text}/{subchapter.xpath('.//title')[0].text}" res = sess.execute( text( @@ -946,8 +949,8 @@ def _add_page_question(sess, db_context, chapter, subchapter, course_name): timestamp=datetime.datetime.now(), is_private="F", question_type="page", - subchapter=subchapter.find("./id").text, - chapter=chapter.find("./id").text, + subchapter=subchapter.xpath(".//id")[0].text, + chapter=chapter.xpath(".//id")[0].text, from_source="T", author=db_context["author"], owner=db_context["owner"], @@ -973,7 +976,7 @@ def _add_page_question(sess, db_context, chapter, subchapter, course_name): def _process_questions(sess, db_context, chapter, subchapter, course_name): """Process all questions in a subchapter.""" - for question in subchapter.findall("./question"): + for question in subchapter.xpath(".//question"): _process_single_question( sess, db_context, chapter, subchapter, question, course_name ) @@ -984,14 +987,13 @@ def _process_single_question( ): """Process a single question element.""" # Extract question content - dbtext = " ".join( - [ET.tostring(y).decode("utf8") for y in question.findall("./htmlsrc/*")] - ) - qlabel = " ".join([y.text for y in question.findall("./label")]) + htmlsrc = question.xpath(".//htmlsrc")[0] + # + dbtext = "".join(ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc) + qlabel = " ".join(question.xpath(".//label")[0].itertext()) # Get question element and metadata el, idchild, old_ww_id, qtype = _extract_question_metadata(question, dbtext) - # Handle webwork case where we need to update dbtext if qtype == "webwork" and el is not None: dbtext = ET.tostring(el).decode("utf8") @@ -1005,8 +1007,8 @@ def _process_single_question( dbtext = _fix_image_urls(dbtext, db_context, course_name) # Build question data - sbc = subchapter.find("./id").text - cpt = chapter.find("./id").text + sbc = subchapter.xpath(".//id")[0].text + cpt = chapter.xpath(".//id")[0].text valudict = dict( base_course=course_name, name=idchild, @@ -1037,7 +1039,7 @@ def _process_single_question( def _extract_question_metadata(question, dbtext): """Extract metadata from a question element.""" - el = question.find(".//*[@data-component]") + el = question.xpath(".//*[@data-component]")[0] old_ww_id = None if el is not None: @@ -1045,7 +1047,7 @@ def _extract_question_metadata(question, dbtext): if "the-id-on-the-webwork" in el.attrib: old_ww_id = el.attrib["the-id-on-the-webwork"] else: - el = question.find("./div") + el = question.xpath(".//div")[0] if el is None: idchild = "fix_me" rslogger.error( @@ -1058,7 +1060,7 @@ def _extract_question_metadata(question, dbtext): try: qtype = el.attrib["data-component"] if qtype == "codelens": - id_el = el.find("./*[@class='pytutorVisualizer']") + id_el = el.xpath(".//*[@class='pytutorVisualizer']")[0] idchild = id_el.attrib["id"] qtype = QT_MAP.get(qtype, qtype) except Exception: @@ -1165,7 +1167,7 @@ def _handle_datafile(el, course_name): def _process_source_elements(sess, subchapter, course_name): """Process source elements in a subchapter.""" - for sourceEl in subchapter.findall("./source"): + for sourceEl in subchapter.xpath(".//source"): id = sourceEl.attrib["id"] file_contents = sourceEl.text filename = sourceEl.attrib.get("filename", sourceEl.attrib["id"]) @@ -1180,13 +1182,14 @@ def _process_source_elements(sess, subchapter, course_name): def _set_course_attributes(sess, db_context, course_name, manifest_path): """Set course attributes from the manifest.""" - tree = ET.parse(manifest_path) + parser = ET.HTMLParser() + tree = ET.parse(manifest_path, parser) root = tree.getroot() - latex = root.find("./latex-macros") + latex = root.xpath(".//latex-macros")[0] rslogger.info("Setting attributes for this base course") - ww_meta = root.find("./webwork-version") + ww_meta = root.xpath(".//webwork-version")[0] if ww_meta is not None: ww_major = ww_meta.attrib["major"] ww_minor = ww_meta.attrib["minor"] From 55a7d93e4e31dacda939a2d1700895f018c421a1 Mon Sep 17 00:00:00 2001 From: Bradley Miller Date: Wed, 5 Nov 2025 15:38:50 -0800 Subject: [PATCH 2/3] fix query to search from current node --- components/rsptx/build_tools/core.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/components/rsptx/build_tools/core.py b/components/rsptx/build_tools/core.py index 175a8e79..441875a9 100644 --- a/components/rsptx/build_tools/core.py +++ b/components/rsptx/build_tools/core.py @@ -153,7 +153,7 @@ def _build_ptx_book(config, gen, manifest, course, click=click, target="runeston if not os.path.exists("project.ptx"): click.echo("PreTeXt books need a project.ptx file") - return {"completed": False, "status": "Missing project.ptx file"} + return {"completed": False, "status": "Missing project.ptx file"} else: click.echo("Checking files") if not target: @@ -162,7 +162,7 @@ def _build_ptx_book(config, gen, manifest, course, click=click, target="runeston # and {"host-platform": "runestone"} in stringparams rs = check_project_ptx(click=click, course=course, target=target) if not rs: - return {"completed": False, "status": "Bad configuration in project.ptx"} + return {"completed": False, "status": "Bad configuration in project.ptx"} logger = logging.getLogger("ptxlogger") string_io_handler = StringIOHandler() @@ -197,7 +197,10 @@ def _build_ptx_book(config, gen, manifest, course, click=click, target="runeston res = copytree(rs.output_dir_abspath(), book_path, dirs_exist_ok=True) if not res: click.echo("Error copying files to published") - return {"completed": False, "status": "Error copying files to published"} + return { + "completed": False, + "status": "Error copying files to published", + } else: click.echo("No need to copy files to published") click.echo("Book deployed successfully") @@ -223,12 +226,10 @@ def _build_ptx_book(config, gen, manifest, course, click=click, target="runeston or "Traceback" in log_string or "compilation failed" in log_string ): - click.echo( - "Nonfatal errors in build, check the log for details" - ) + click.echo("Nonfatal errors in build, check the log for details") return {"completed": True, "status": "Nonfatal errors in build"} click.echo("Build completed successfully") - return {"completed": True, "status": "Build completed successfully"} + return {"completed": True, "status": "Build completed successfully"} # Support Functions @@ -678,7 +679,7 @@ def _process_single_chapter(sess, db_context, chapter, chap_num, course_name): .values( chapter_name=f"{cnum} {chapter.xpath('.//title')[0].text}", course_id=course_name, - chapter_label=chapter.xpath('//id')[0].text, + chapter_label=chapter.xpath(".//id")[0].text, chapter_num=chap_num, ) ) @@ -891,7 +892,9 @@ def _process_single_timed_assignment( qnum += 1 # Extract question content htmlsrc = question.xpath(".//htmlsrc")[0] - dbtext = "".join(ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc) + dbtext = "".join( + ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc + ) qlabel = " ".join(question.xpath(".//label")[0].itertext()) # Get question element and metadata @@ -989,7 +992,9 @@ def _process_single_question( # Extract question content htmlsrc = question.xpath(".//htmlsrc")[0] # - dbtext = "".join(ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc) + dbtext = "".join( + ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc + ) qlabel = " ".join(question.xpath(".//label")[0].itertext()) # Get question element and metadata From e0c1d37885ee7698f420a09282144c2036945897 Mon Sep 17 00:00:00 2001 From: Bradley Miller Date: Thu, 6 Nov 2025 07:47:31 -0800 Subject: [PATCH 3/3] Fix encoding issues --- components/rsptx/build_tools/core.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/components/rsptx/build_tools/core.py b/components/rsptx/build_tools/core.py index 441875a9..06424985 100644 --- a/components/rsptx/build_tools/core.py +++ b/components/rsptx/build_tools/core.py @@ -362,7 +362,7 @@ def update_library( # This is a bit of a hack for now... todo: continue to refactor these to use crud functions eng = create_engine(config.dburl.replace("+asyncpg", "")) if build_system == "PTX": - parser = ET.HTMLParser() + parser = ET.HTMLParser(encoding="utf-8") tree = ET.parse(mpath, parser) docinfo_list = tree.xpath("//library-metadata") docinfo = docinfo_list[0] if docinfo_list else None @@ -580,7 +580,7 @@ def _initialize_db_context(engine, sess, course_name, manifest_path): assignment_questions = Table("assignment_questions", meta, autoload_with=engine) # Get the author name from the manifest - parser = ET.HTMLParser() + parser = ET.HTMLParser(encoding="utf-8") tree = ET.parse(manifest_path, parser) docinfo_list = tree.xpath("//library-metadata") docinfo = docinfo_list[0] if docinfo_list else None @@ -638,7 +638,7 @@ def _process_chapters(sess, db_context, course_name, manifest_path): """Process all chapters from the manifest.""" rslogger.info("Populating the database with Chapter information") - parser = ET.HTMLParser() + parser = ET.HTMLParser(encoding="utf-8") tree = ET.parse(manifest_path, parser) root = tree.getroot() chap = 0 @@ -653,7 +653,7 @@ def _process_appendices(sess, db_context, course_name, manifest_path): """Process all appendices from the manifest.""" rslogger.info("Populating the database with Appendix information") - parser = ET.HTMLParser() + parser = ET.HTMLParser(encoding="utf-8") tree = ET.parse(manifest_path, parser) root = tree.getroot() @@ -893,7 +893,7 @@ def _process_single_timed_assignment( # Extract question content htmlsrc = question.xpath(".//htmlsrc")[0] dbtext = "".join( - ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc + ET.tostring(child, encoding="utf-8", method="html").decode("utf-8") for child in htmlsrc ) qlabel = " ".join(question.xpath(".//label")[0].itertext()) @@ -993,10 +993,10 @@ def _process_single_question( htmlsrc = question.xpath(".//htmlsrc")[0] # dbtext = "".join( - ET.tostring(child, encoding="unicode", method="html") for child in htmlsrc + ET.tostring(child, encoding="utf-8", method="html").decode("utf-8") for child in htmlsrc ) qlabel = " ".join(question.xpath(".//label")[0].itertext()) - + print(f"dbtext = {dbtext}") # Get question element and metadata el, idchild, old_ww_id, qtype = _extract_question_metadata(question, dbtext) # Handle webwork case where we need to update dbtext @@ -1187,7 +1187,7 @@ def _process_source_elements(sess, subchapter, course_name): def _set_course_attributes(sess, db_context, course_name, manifest_path): """Set course attributes from the manifest.""" - parser = ET.HTMLParser() + parser = ET.HTMLParser(encoding="utf-8") tree = ET.parse(manifest_path, parser) root = tree.getroot()