From 0f158bad211c2e0b812fa0a8ca9af35f85f90d23 Mon Sep 17 00:00:00 2001 From: MaxKing <1512347620@qq.com> Date: Sun, 22 Mar 2026 18:17:11 +0800 Subject: [PATCH 1/3] Using libxml2 source code --- .gitmodules | 4 + Cargo.toml | 56 +++++----- build.rs | 171 ------------------------------ libxml-sys/Cargo.toml | 28 +++++ libxml-sys/README.md | 1 + libxml-sys/build.rs | 40 +++++++ libxml-sys/libxml2 | 1 + libxml-sys/src/lib.rs | 15 +++ {src => libxml-sys/src}/wrapper.h | 10 +- src/bindings.rs | 8 +- src/c_helpers.rs | 10 -- src/schemas/common.rs | 11 -- 12 files changed, 122 insertions(+), 233 deletions(-) create mode 100644 .gitmodules delete mode 100644 build.rs create mode 100644 libxml-sys/Cargo.toml create mode 100644 libxml-sys/README.md create mode 100644 libxml-sys/build.rs create mode 160000 libxml-sys/libxml2 create mode 100644 libxml-sys/src/lib.rs rename {src => libxml-sys/src}/wrapper.h (100%) diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..3a1b066a9 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "libxml-sys/libxml2"] + path = libxml-sys/libxml2 + url = https://github.com/GNOME/libxml2.git + branch = 2.15 diff --git a/Cargo.toml b/Cargo.toml index 825b9c6e2..0a9458fbf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,44 +1,42 @@ -[package] -name = "libxml" +[workspace] +members = ['.', "libxml-sys"] +[workspace.package] version = "0.3.8" edition = "2024" -authors = ["Andreas Franzén ", "Deyan Ginev ","Jan Frederik Schaefer "] +authors = [ + "Andreas Franzén ", + "Deyan Ginev ", + "Jan Frederik Schaefer ", +] description = "A Rust wrapper for libxml2 - the XML C parser and toolkit developed for the Gnome project" repository = "https://github.com/KWARC/rust-libxml" -documentation = "https://kwarc.github.io/rust-libxml/libxml/index.html" readme = "README.md" license = "MIT" -keywords = ["xml", "libxml","xpath", "parser", "html"] -build = "build.rs" -exclude = [ - "scripts/*" -] +keywords = ["xml", "libxml", "xpath", "parser", "html"] + +[package] +name = "libxml" +version.workspace = true +edition.workspace = true +authors.workspace = true +description.workspace = true +repository.workspace = true +documentation = "https://kwarc.github.io/rust-libxml/libxml/index.html" +readme.workspace = true +license.workspace = true +keywords.workspace = true +exclude = ["scripts/*"] [lib] name = "libxml" +[features] +iconv = ["libxml-sys/iconv"] +zlib = ["libxml-sys/zlib"] + [dependencies] libc = "0.2" - -[target.'cfg(all(target_family = "windows", target_env = "msvc"))'.build-dependencies] -vcpkg = "0.2" - -[target.'cfg(all(target_family = "windows", target_env = "gnu"))'.build-dependencies] -pkg-config = "0.3.2" - -[target.'cfg(macos)'.build-dependencies] -pkg-config = "0.3.2" - -[target.'cfg(unix)'.build-dependencies] -pkg-config = "0.3.2" - -[build-dependencies.bindgen] -version = "0.72" -features = [ - "runtime", -] -default-features = false - +libxml-sys = { path = "libxml-sys" } [dev-dependencies] rayon = "1.0.0" criterion = "0.8.0" diff --git a/build.rs b/build.rs deleted file mode 100644 index 852047ad4..000000000 --- a/build.rs +++ /dev/null @@ -1,171 +0,0 @@ -use std::{ - env, fs, - path::{Path, PathBuf}, -}; - -struct ProbedLib { - version: String, - include_paths: Vec, -} - -/// Finds libxml2 and optionally return a list of header -/// files from which the bindings can be generated. -fn find_libxml2() -> Option { - #![allow(unreachable_code)] // for platform-dependent dead code - - if let Ok(ref s) = std::env::var("LIBXML2") { - // println!("{:?}", std::env::vars()); - // panic!("set libxml2."); - let p = std::path::Path::new(s); - let fname = std::path::Path::new( - p.file_name() - .unwrap_or_else(|| panic!("no file name in LIBXML2 env ({s})")), - ); - assert!( - p.is_file(), - "{}", - &format!("not a file in LIBXML2 env ({s})") - ); - println!( - "cargo:rustc-link-lib={}", - fname - .file_stem() - .unwrap() - .to_string_lossy() - .strip_prefix("lib") - .unwrap() - ); - println!( - "cargo:rustc-link-search={}", - p.parent() - .expect("no library path in LIBXML2 env") - .to_string_lossy() - ); - None - } else { - #[cfg(any( - target_family = "unix", - target_os = "macos", - all(target_family = "windows", target_env = "gnu") - ))] - { - let lib = pkg_config::Config::new() - .probe("libxml-2.0") - .expect("Couldn't find libxml2 via pkg-config"); - return Some(ProbedLib { - include_paths: lib.include_paths, - version: lib.version, - }); - } - - #[cfg(all(target_family = "windows", target_env = "msvc"))] - { - if let Some(meta) = vcpkg_dep::vcpkg_find_libxml2() { - return Some(meta); - } else { - eprintln!("vcpkg did not succeed in finding libxml2."); - } - } - - panic!("Could not find libxml2.") - } -} - -fn generate_bindings(header_dirs: Vec, output_path: &Path) { - let bindings = bindgen::Builder::default() - .header("src/wrapper.h") - .opaque_type("max_align_t") - // invalidate build as soon as the wrapper changes - .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) - .layout_tests(true) - .clang_args(&[ - "-DPKG-CONFIG", - "-DLIBXML_C14N_ENABLED", - "-DLIBXML_OUTPUT_ENABLED", - ]) - .clang_args(header_dirs.iter().map(|dir| format!("-I{}", dir.display()))); - bindings - .generate() - .expect("failed to generate bindings with bindgen") - .write_to_file(output_path) - .expect("Failed to write bindings.rs"); -} - -fn main() { - let bindings_path = PathBuf::from(env::var_os("OUT_DIR").unwrap()).join("bindings.rs"); - // declare availability of config variable (without setting it) - println!("cargo::rustc-check-cfg=cfg(libxml_older_than_2_12)"); - - if let Some(probed_lib) = find_libxml2() { - // if we could find header files, generate fresh bindings from them - generate_bindings(probed_lib.include_paths, &bindings_path); - // and expose the libxml2 version to the code - let version_parts: Vec = probed_lib - .version - .split('.') - .map(|part| part.parse::().unwrap_or(-1)) - .collect(); - let older_than_2_12 = version_parts.len() > 1 - && (version_parts[0] < 2 || version_parts[0] == 2 && version_parts[1] < 12); - println!("cargo::rustc-check-cfg=cfg(libxml_older_than_2_12)"); - if older_than_2_12 { - println!("cargo::rustc-cfg=libxml_older_than_2_12"); - } - } else { - // otherwise, use the default bindings on platforms where pkg-config isn't available - fs::copy(PathBuf::from("src/default_bindings.rs"), bindings_path) - .expect("Failed to copy the default bindings to the build directory"); - // for now, assume that the library is older than 2.12, because that's what those bindings are computed with - println!("cargo::rustc-cfg=libxml_older_than_2_12"); - } -} - -#[cfg(all(target_family = "windows", target_env = "msvc"))] -mod vcpkg_dep { - use crate::ProbedLib; - pub fn vcpkg_find_libxml2() -> Option { - if let Ok(metadata) = vcpkg::Config::new().find_package("libxml2") { - let include_paths = metadata - .include_paths - .into_iter() - .fold(Vec::new(), |mut acc, p| { - acc.push(p.join("libxml2")); - acc.push(p); - acc - }); - return Some(ProbedLib { - version: vcpkg_version(), - include_paths, - }); - } - None - } - - fn vcpkg_version() -> String { - // What is the best way to obtain the version on Windows *before* bindgen runs? - // here we attempt asking the shell for "vcpkg list libxml2" - let mut vcpkg_exe = vcpkg::find_vcpkg_root(&vcpkg::Config::new()).unwrap(); - vcpkg_exe.push("vcpkg.exe"); - let vcpkg_list_libxml2 = std::process::Command::new(vcpkg_exe) - .args(["list", "libxml2"]) - .output() - .expect("vcpkg.exe failed to execute in vcpkg_dep build step"); - if vcpkg_list_libxml2.status.success() { - let libxml2_list_str = String::from_utf8_lossy(&vcpkg_list_libxml2.stdout); - for line in libxml2_list_str.lines() { - if line.starts_with("libxml2:") { - let mut version_piece = line.split("2."); - version_piece.next(); - if let Some(version_tail) = version_piece.next() { - if let Some(version) = version_tail.split(' ').next().unwrap().split('#').next() { - return format!("2.{version}"); - } - } - } - } - } - // default to a recent libxml2 from Windows 10 - // (or should this panic?) - String::from("2.13.5") - } -} diff --git a/libxml-sys/Cargo.toml b/libxml-sys/Cargo.toml new file mode 100644 index 000000000..4e1a88054 --- /dev/null +++ b/libxml-sys/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "libxml-sys" +version.workspace = true +edition.workspace = true +authors.workspace = true +description.workspace = true +repository.workspace = true +readme.workspace = true +license.workspace = true +keywords.workspace = true + +[features] +iconv = [] +zlib = [] + +[dependencies] + + +[build-dependencies] +cmake = "0.1.57" +bindgen = "0.72.1" + + +[package.metadata.cross.target.x86_64-pc-windows-gnu] +pre-build = [ + "apt-get update", + "apt-get install -y win-iconv-mingw-w64-dev libz-mingw-w64-dev", +] diff --git a/libxml-sys/README.md b/libxml-sys/README.md new file mode 100644 index 000000000..dc7ec92b0 --- /dev/null +++ b/libxml-sys/README.md @@ -0,0 +1 @@ +* Build libxml2 from source diff --git a/libxml-sys/build.rs b/libxml-sys/build.rs new file mode 100644 index 000000000..a20186fb4 --- /dev/null +++ b/libxml-sys/build.rs @@ -0,0 +1,40 @@ +use std::{env, path::PathBuf}; + +fn main() { + let out_dir = env::var("OUT_DIR").unwrap(); + + let iconv = if cfg!(feature = "iconv") { "ON" } else { "OFF" }; + let zlib = if cfg!(feature = "zlib") { "ON" } else { "OFF" }; + let path = cmake::Config::new("libxml2") + .define("BUILD_SHARED_LIBS", "OFF") + .define("LIBXML2_WITH_ICONV", iconv) + .define("LIBXML2_WITH_ZLIB", zlib) + .define("LIBXML2_WITH_C14N", "ON") + .build(); + + println!("cargo::rerun-if-changed=libxml2"); + + let libs = std::process::Command::new(format!("{}/bin/xml2-config", path.display(),)) + .arg("--libs") + .output() + .expect(""); + let libs = String::from_utf8_lossy(&libs.stdout); + println!("cargo::rustc-flags={}", libs); + + let cflags = std::process::Command::new(format!("{}/bin/xml2-config", path.display(),)) + .arg("--cflags") + .output() + .expect(""); + let cflags = String::from_utf8_lossy(&cflags.stdout); + + let bindings_path = PathBuf::from(out_dir).join("bindings.rs"); + bindgen::builder() + .opaque_type("max_align_t") + .header("src/wrapper.h") + .clang_args(&["-DLIBXML_C14N_ENABLED", "-DLIBXML_OUTPUT_ENABLED"]) + .clang_args(cflags.split_whitespace()) + .generate() + .expect("failed to generate bindings with bindgen") + .write_to_file(bindings_path) + .expect("Failed to write bindings.rs"); +} diff --git a/libxml-sys/libxml2 b/libxml-sys/libxml2 new file mode 160000 index 000000000..aa6db631f --- /dev/null +++ b/libxml-sys/libxml2 @@ -0,0 +1 @@ +Subproject commit aa6db631f3761bcd1fd1fa22b8e83bc14b279d1c diff --git a/libxml-sys/src/lib.rs b/libxml-sys/src/lib.rs new file mode 100644 index 000000000..c09f5368a --- /dev/null +++ b/libxml-sys/src/lib.rs @@ -0,0 +1,15 @@ +mod ffi { + #![allow(non_upper_case_globals)] + #![allow(non_camel_case_types)] + #![allow(non_snake_case)] + #![allow(dead_code)] + #![allow(improper_ctypes)] + #![allow(missing_docs)] + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +} +/* + * helper var until we figure out well-formedness checks + */ + +pub static mut HACKY_WELL_FORMED: bool = false; +pub use ffi::*; diff --git a/src/wrapper.h b/libxml-sys/src/wrapper.h similarity index 100% rename from src/wrapper.h rename to libxml-sys/src/wrapper.h index fbf0adf40..b79630539 100644 --- a/src/wrapper.h +++ b/libxml-sys/src/wrapper.h @@ -4,12 +4,14 @@ // #include #include // #include +#include +#include +#include +#include #include #include #include #include -#include -#include #include #include #include @@ -17,8 +19,6 @@ #include #include #include -#include -#include #include #include #include @@ -27,10 +27,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/src/bindings.rs b/src/bindings.rs index e1699448d..a043ed9b0 100644 --- a/src/bindings.rs +++ b/src/bindings.rs @@ -6,10 +6,4 @@ #![allow(improper_ctypes)] #![allow(missing_docs)] -/* - * helper var until we figure out well-formedness checks - */ - -pub static mut HACKY_WELL_FORMED: bool = false; - -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +pub use libxml_sys::*; diff --git a/src/c_helpers.rs b/src/c_helpers.rs index ab05b13dd..011a0b899 100644 --- a/src/c_helpers.rs +++ b/src/c_helpers.rs @@ -97,16 +97,6 @@ pub fn xmlNodeGetName(cur: xmlNodePtr) -> *const c_char { } // dummy function: no debug output at all -#[cfg(libxml_older_than_2_12)] -unsafe extern "C" fn _ignoreInvalidTagsErrorFunc(_user_data: *mut c_void, error: xmlErrorPtr) { - unsafe { - if !error.is_null() && (*error).code as xmlParserErrors == xmlParserErrors_XML_HTML_UNKNOWN_TAG { - // do not record invalid, in fact (out of despair) claim we ARE well-formed, when a tag is invalid. - HACKY_WELL_FORMED = true; - } - } -} -#[cfg(not(libxml_older_than_2_12))] unsafe extern "C" fn _ignoreInvalidTagsErrorFunc(_user_data: *mut c_void, error: *const xmlError) { unsafe { if !error.is_null() && (*error).code as xmlParserErrors == xmlParserErrors_XML_HTML_UNKNOWN_TAG { diff --git a/src/schemas/common.rs b/src/schemas/common.rs index 352114afd..a2f337188 100644 --- a/src/schemas/common.rs +++ b/src/schemas/common.rs @@ -9,17 +9,6 @@ use std::ffi::c_void; /// Provides a callback to the C side of things to accumulate xmlErrors to be /// handled back on the Rust side. -#[cfg(libxml_older_than_2_12)] -pub unsafe extern "C" fn structured_error_handler(ctx: *mut c_void, error: bindings::xmlErrorPtr) { - assert!(!ctx.is_null()); - let errlog = unsafe { &mut *{ ctx as *mut Vec } }; - - let error = unsafe { StructuredError::from_raw(error) }; - - errlog.push(error); -} - -#[cfg(not(libxml_older_than_2_12))] pub unsafe extern "C" fn structured_error_handler(ctx: *mut c_void, error: *const bindings::xmlError) { assert!(!ctx.is_null()); let errlog = unsafe { &mut *{ ctx as *mut Vec } }; From 3cb50a435cdf624c67e4aab72f629022ddab680a Mon Sep 17 00:00:00 2001 From: MaxKing <1512347620@qq.com> Date: Thu, 26 Mar 2026 10:55:11 +0800 Subject: [PATCH 2/3] open ci submodules --- .github/workflows/CI.yml | 4 ++++ .github/workflows/gh-pages.yml | 2 ++ .github/workflows/windows.yml | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ef3d2965e..15892481f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -18,6 +18,8 @@ jobs: run: echo "LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so" >> "$GITHUB_ENV" if: ${{ matrix.with_default_bindings }} - uses: actions/checkout@v2 + with: + submodules: 'true' - uses: actions-rs/toolchain@v1 with: profile: minimal @@ -40,6 +42,8 @@ jobs: with: packages: "libpython3-dev" - uses: actions/checkout@v2 + with: + submodules: 'true' - name: Install libxml ${{ matrix.libxml_version }} by hand run: | wget https://download.gnome.org/sources/libxml2/$(echo ${{ matrix.libxml_version }} | sed -e 's/\.[0-9]*$//')/libxml2-${{ matrix.libxml_version }}.tar.xz diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index cfc127cdb..cc1693788 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -23,6 +23,8 @@ jobs: override: true - name: Checkout sources uses: actions/checkout@v4 + with: + submodules: 'true' - name: Build Documentation uses: actions-rs/cargo@v1 with: diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 55251c3ed..6cf1cf274 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -12,6 +12,8 @@ jobs: VCPKG_ROOT: C:\vcpkg steps: - uses: actions/checkout@v4 + with: + submodules: 'true' - name: Setup vcpkg libxml2 Cache uses: actions/cache@v4 id: vcpkg-cache @@ -35,6 +37,8 @@ jobs: shell: msys2 {0} steps: - uses: actions/checkout@v4 + with: + submodules: 'true' - uses: msys2/setup-msys2@v2 with: path-type: minimal From 6a4932d46c85811396cb00f660a5f462ccaef5e1 Mon Sep 17 00:00:00 2001 From: MaxKing <1512347620@qq.com> Date: Thu, 26 Mar 2026 16:05:29 +0800 Subject: [PATCH 3/3] fix: ci --- libxml-sys/Cargo.toml | 2 +- libxml-sys/build.rs | 52 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/libxml-sys/Cargo.toml b/libxml-sys/Cargo.toml index 4e1a88054..c7964a265 100644 --- a/libxml-sys/Cargo.toml +++ b/libxml-sys/Cargo.toml @@ -19,7 +19,7 @@ zlib = [] [build-dependencies] cmake = "0.1.57" bindgen = "0.72.1" - +regex = "1.12.3" [package.metadata.cross.target.x86_64-pc-windows-gnu] pre-build = [ diff --git a/libxml-sys/build.rs b/libxml-sys/build.rs index a20186fb4..6d02cf893 100644 --- a/libxml-sys/build.rs +++ b/libxml-sys/build.rs @@ -1,10 +1,11 @@ -use std::{env, path::PathBuf}; +use std::{env, fs, path::PathBuf}; fn main() { let out_dir = env::var("OUT_DIR").unwrap(); let iconv = if cfg!(feature = "iconv") { "ON" } else { "OFF" }; let zlib = if cfg!(feature = "zlib") { "ON" } else { "OFF" }; + let path = cmake::Config::new("libxml2") .define("BUILD_SHARED_LIBS", "OFF") .define("LIBXML2_WITH_ICONV", iconv) @@ -14,19 +15,52 @@ fn main() { println!("cargo::rerun-if-changed=libxml2"); - let libs = std::process::Command::new(format!("{}/bin/xml2-config", path.display(),)) + let host = env::var("HOST").unwrap(); + + let mut libs = std::process::Command::new("sh") + .arg(path.join("bin/xml2-config")) .arg("--libs") .output() - .expect(""); - let libs = String::from_utf8_lossy(&libs.stdout); - println!("cargo::rustc-flags={}", libs); - - let cflags = std::process::Command::new(format!("{}/bin/xml2-config", path.display(),)) + .map(|output| String::from_utf8_lossy(&output.stdout).to_string()); + let mut cflags = std::process::Command::new("sh") + .arg(path.join("bin/xml2-config")) .arg("--cflags") .output() - .expect(""); - let cflags = String::from_utf8_lossy(&cflags.stdout); + .map(|output| String::from_utf8_lossy(&output.stdout).to_string()); + + if host.contains("windows") { + let reg = regex::Regex::new("-(.)/(.)/").expect("reg"); + libs = libs.map(|v| reg.replace_all(&v, "-$1$2:/").to_string()); + cflags = cflags.map(|v| reg.replace_all(&v, "-$1$2:/").to_string()); + } + // NOTE: Manually specify + let mut libs = libs.unwrap_or_else(|_| format!("-L{} -lxml2", path.join("lib").display())); + if host.contains("msvc") { + let mut iters = fs::read_dir(path.join("lib")) + .expect("read_dir") + .filter_map(|p| { + p.ok().and_then(|p| { + let metadata = p.metadata().ok()?; + let file_name = p.file_name(); + let name = file_name.to_string_lossy(); + if metadata.is_file() && name.starts_with("libxml2") && name.ends_with(".lib") { + return Some( + name + .trim_end_matches(".lib") + .to_string(), + ); + } + None + }) + }); + let name = iters.next().expect("xml name"); + println!("cargo:rustc-link-lib=bcrypt"); + libs = libs.replace("-lxml2", &format!("-l{}", name)); + } + println!("cargo::rustc-flags={}", libs); + // Note: Manually specify + let cflags = cflags.unwrap_or_else(|_| format!("-I{}", path.join("include/libxml2").display())); let bindings_path = PathBuf::from(out_dir).join("bindings.rs"); bindgen::builder() .opaque_type("max_align_t")