Progress

aarranz · aarranz · commit 73219d8cda22 · 2024-07-18T03:34:44.000+02:00
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,9 +9,15 @@ name = "simple_unicode_normalization_forms"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = "0.22.0"
+lazy_static = "1.5.0"
+regex = "1.10.5"
 unicode-normalization = "0.1.23"
 
+[dependencies.pyo3]
+version = "0.22.0"
+# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
+features = ["abi3-py38"]
+
 [target.aarch64-apple-darwin]
 rustflags = [
   "-C", "link-arg=-undefined",
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,23 +1,54 @@
+// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L.
+
+use lazy_static::lazy_static;
+use regex::Regex;
 use pyo3::prelude::*;
+use std::collections::HashSet;
 use unicode_normalization::char::decompose_compatible;
+use unicode_normalization::UnicodeNormalization;
+
+lazy_static! {
+    static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap(); // one or more whitespace characters
+    static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]").unwrap(); // emoji, control chars (Cc), variation selectors, enclosing marks
+}
 
 /// Gives the normalized form of a string skipping some characters.
-#[pyfunction]
-fn nfkc_normalization(str: String, allow_chars: Vec<char>) -> PyResult<String> {
-    let mut result = String::with_capacity(str.len() * 2);
+fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String {
+    let mut result = String::with_capacity(str.len());
     for c in str.chars() {
         if allow_chars.contains(&c) {
             result.push(c)
         } else {
-            decompose_compatible(c, |r| result.push(r))
+            decompose_compatible(c, |r| {
+                // Ignore characters outside the Basic Multilingual Plane (code points above U+FFFF)
+                if r <= '\u{FFFF}' {
+                    result.push(r)
+                }
+            })
         }
     }
-    Ok(result)
+
+    result.nfc().collect::<String>()
+}
+
+#[pyfunction]
+fn basic_string_clean(value: String) -> PyResult<String> {
+    Ok(nfkc_normalization(value, HashSet::from(['º', 'ª'])).trim().to_string())
+}
+
+#[pyfunction]
+fn remove_emojis(value: String) -> PyResult<String> {
+    let cleaned_value = nfkc_normalization(value, HashSet::from(['º', 'ª']));
+    let whitespace_cleaned_value = WHITESPACE_RE.replace_all(&cleaned_value, " ");
+    let result = EMOJI_RE.replace_all(&whitespace_cleaned_value, "");
+
+    Ok(result.trim().to_string())
 }
 
 /// A Python module implemented in Rust.
 #[pymodule]
-fn ficodes_string_normalization(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_function(wrap_pyfunction!(nfkc_normalization, m)?)?;
+fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_function(wrap_pyfunction!(basic_string_clean, m)?)?;
+    m.add_function(wrap_pyfunction!(remove_emojis, m)?)?;
     Ok(())
 }