|
| 1 | +// Copyright (c) 2024 Future Internet Consulting and Development Solutions S.L. |
| 2 | + |
| 3 | +use lazy_static::lazy_static; |
| 4 | +use regex::Regex; |
1 | 5 | use pyo3::prelude::*; |
| 6 | +use std::collections::HashSet; |
2 | 7 | use unicode_normalization::char::decompose_compatible; |
| 8 | +use unicode_normalization::UnicodeNormalization; |
| 9 | + |
| 10 | +lazy_static! { |
| 11 | + static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap(); |
| 12 | + static ref EMOJI_RE: Regex = Regex::new(r"[\p{Emoji_Presentation}\p{Emoji_Modifier}\p{Emoji_Modifier_Base}\p{Cc}\uFE0E\uFE0F\u20E2\u20E3\u20E4]").unwrap(); |
| 13 | +} |
3 | 14 |
|
4 | 15 | /// Gives the normalized form of a string, exempting the characters in `allow_chars` from compatibility decomposition.
5 | | -#[pyfunction] |
6 | | -fn nfkc_normalization(str: String, allow_chars: Vec<char>) -> PyResult<String> { |
7 | | - let mut result = String::with_capacity(str.len() * 2); |
| 16 | +fn nfkc_normalization(str: String, allow_chars: HashSet<char>) -> String { |
| 17 | + let mut result = String::with_capacity(str.len()); |
8 | 18 | for c in str.chars() { |
9 | 19 | if allow_chars.contains(&c) { |
10 | 20 | result.push(c) |
11 | 21 | } else { |
12 | | - decompose_compatible(c, |r| result.push(r)) |
| 22 | + decompose_compatible(c, |r| { |
| 23 | + // Drop decomposed characters outside the Basic Multilingual Plane (code points above U+FFFF)
| 24 | + if r <= '\u{FFFF}' { |
| 25 | + result.push(r) |
| 26 | + } |
| 27 | + }) |
13 | 28 | } |
14 | 29 | } |
15 | | - Ok(result) |
| 30 | + |
| 31 | + result.nfc().collect::<String>() |
| 32 | +} |
| 33 | + |
| 34 | +#[pyfunction] |
| 35 | +fn basic_string_clean(value: String) -> PyResult<String> { |
| 36 | + Ok(nfkc_normalization(value, HashSet::from(['º', 'ª'])).trim().to_string()) |
| 37 | +} |
| 38 | + |
| 39 | +#[pyfunction] |
| 40 | +fn remove_emojis(value: String) -> PyResult<String> { |
| 41 | + let cleaned_value = nfkc_normalization(value, HashSet::from(['º', 'ª'])); |
| 42 | + let whitespace_cleaned_value = WHITESPACE_RE.replace_all(&cleaned_value, " "); |
| 43 | + let result = EMOJI_RE.replace_all(&whitespace_cleaned_value, ""); |
| 44 | + |
| 45 | + Ok(result.trim().to_string()) |
16 | 46 | } |
17 | 47 |
|
18 | 48 | /// A Python module implemented in Rust. |
19 | 49 | #[pymodule] |
20 | | -fn ficodes_string_normalization(m: &Bound<'_, PyModule>) -> PyResult<()> { |
21 | | - m.add_function(wrap_pyfunction!(nfkc_normalization, m)?)?; |
| 50 | +fn simple_unicode_normalization_forms(m: &Bound<'_, PyModule>) -> PyResult<()> { |
| 51 | + m.add_function(wrap_pyfunction!(basic_string_clean, m)?)?; |
| 52 | + m.add_function(wrap_pyfunction!(remove_emojis, m)?)?; |
22 | 53 | Ok(()) |
23 | 54 | } |
0 commit comments