Skip to content

Commit 73219d8

Browse files
committed
Progress
1 parent 75e1996 commit 73219d8

File tree

3 files changed

+97
-8
lines changed

3 files changed

+97
-8
lines changed

Cargo.lock

Lines changed: 52 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,15 @@ name = "simple_unicode_normalization_forms"
99
crate-type = ["cdylib"]
1010

1111
[dependencies]
12-
pyo3 = "0.22.0"
12+
lazy_static = "1.5.0"
13+
regex = "1.10.5"
1314
unicode-normalization = "0.1.23"
1415

16+
[dependencies.pyo3]
17+
version = "0.22.0"
18+
# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
19+
features = ["abi3-py38"]
20+
1521
[target.aarch64-apple-darwin]
1622
rustflags = [
1723
"-C", "link-arg=-undefined",

src/lib.rs

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,54 @@
1+
// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L.
2+
3+
use lazy_static::lazy_static;
4+
use regex::Regex;
15
use pyo3::prelude::*;
6+
use std::collections::HashSet;
27
use unicode_normalization::char::decompose_compatible;
8+
use unicode_normalization::UnicodeNormalization;
9+
10+
lazy_static! {
11+
static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap();
12+
// Characters stripped by `remove_emojis`: emoji-presentation characters, emoji modifiers and
// modifier bases, control characters (general category Cc), the variation selectors
// U+FE0E/U+FE0F and the combining enclosing marks U+20E2..U+20E4.
// NOTE: the control-character class must be written `\p{Cc}`; `\{Cc}` is an escaped literal `{`
// followed by the literal characters `C`, `c` and `}`, which would delete those from the text.
static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]").unwrap();
13+
}
314

415
/// Returns the compatibility-normalized form of a string, leaving the characters in `allow_chars` untouched.
5-
#[pyfunction]
6-
fn nfkc_normalization(str: String, allow_chars: Vec<char>) -> PyResult<String> {
7-
let mut result = String::with_capacity(str.len() * 2);
16+
fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String {
17+
let mut result = String::with_capacity(str.len());
818
for c in str.chars() {
919
if allow_chars.contains(&c) {
1020
result.push(c)
1121
} else {
12-
decompose_compatible(c, |r| result.push(r))
22+
decompose_compatible(c, |r| {
23+
// Keep only decomposed characters within the Basic Multilingual Plane (up to U+FFFF); anything above is dropped
24+
if r <= '\u{FFFF}' {
25+
result.push(r)
26+
}
27+
})
1328
}
1429
}
15-
Ok(result)
30+
31+
result.nfc().collect::<String>()
32+
}
33+
34+
#[pyfunction]
35+
fn basic_string_clean(value: String) -> PyResult<String> {
36+
Ok(nfkc_normalization(value, HashSet::from(['º', 'ª'])).trim().to_string())
37+
}
38+
39+
#[pyfunction]
40+
fn remove_emojis(value: String) -> PyResult<String> {
41+
let cleaned_value = nfkc_normalization(value, HashSet::from(['º', 'ª']));
42+
let whitespace_cleaned_value = WHITESPACE_RE.replace_all(&cleaned_value, " ");
43+
let result = EMOJI_RE.replace_all(&whitespace_cleaned_value, "");
44+
45+
Ok(result.trim().to_string())
1646
}
1747

1848
/// A Python module implemented in Rust.
1949
#[pymodule]
20-
fn ficodes_string_normalization(m: &Bound<'_, PyModule>) -> PyResult<()> {
21-
m.add_function(wrap_pyfunction!(nfkc_normalization, m)?)?;
50+
fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> {
51+
m.add_function(wrap_pyfunction!(basic_string_clean, m)?)?;
52+
m.add_function(wrap_pyfunction!(remove_emojis, m)?)?;
2253
Ok(())
2354
}

0 commit comments

Comments
 (0)