Initial commit

2022-05-19 23:26:00 +02:00
commit 8a8baffba8
53 changed files with 761345 additions and 0 deletions
--- a/crates/parse/src/lib.rs
+++ b/crates/parse/src/lib.rs
@@ -0,0 +1 @@
+pub mod uca;
--- a/crates/parse/src/uca.rs
+++ b/crates/parse/src/uca.rs
@@ -0,0 +1 @@
+pub mod allkeys;
--- a/crates/parse/src/uca/allkeys.rs
+++ b/crates/parse/src/uca/allkeys.rs
@@ -0,0 +1,192 @@
+//! Parse allkeys.txt
+//!
+//! See http://unicode.org/reports/tr10/#File_Format for information about the file format.
+
+/// Everything collected from one `allkeys.txt` input by [`parse`].
+#[derive(Debug, Default)]
+pub struct AllKeys {
+    /// Version taken from the `@version` line; `None` if no such line was seen.
+    pub version: Option<Version>,
+    /// One item per `@implicitweights` line, in input order.
+    pub implicit_weights: Vec<ImplicitWeight>,
+    /// One item per remaining data line (not blank, not a `#` comment, not an `@` directive
+    /// handled above), in input order.
+    pub entries: Vec<Entry>,
+}
+
+/// The three dot-separated decimal components of an `@version` line.
+#[derive(Debug)]
+pub struct Version {
+    /// First component.
+    pub major: u16,
+    /// Second component.
+    pub minor: u16,
+    /// Third component.
+    pub variant: u16,
+}
+
+/// One `@implicitweights START..END; BASE # comment` line; all numbers are read as hexadecimal.
+#[derive(Debug)]
+pub struct ImplicitWeight {
+    /// Value before the `..` of the range.
+    pub start: u32,
+    /// Value after the `..` of the range.
+    // NOTE(review): presumably inclusive, as in UTS #10 -- confirm against the spec.
+    pub end: u32,
+    /// Value following the `;`.
+    pub base: u32,
+    /// Trimmed text after the `#`; `None` when that text is empty.
+    pub comment: Option<String>,
+}
+
+/// One data line of the form `CHARS ; [ELEMENT][ELEMENT]... # comment`.
+#[derive(Debug)]
+pub struct Entry {
+    /// Hexadecimal values listed before the `;`, in order.
+    pub chars: Vec<u32>,
+    /// Bracketed elements listed between the `;` and the `#`, in order.
+    pub elements: Vec<Element>,
+    /// Trimmed text after the `#`; `None` when that text is empty.
+    pub comment: Option<String>,
+}
+
+/// One bracketed element such as `[.0000.0033.0002]` or `[*0209.0020.0002]`.
+///
+/// The weight fields are read as hexadecimal, in the order they appear in the brackets.
+#[derive(Debug, PartialEq)]
+pub struct Element {
+    /// First weight field.
+    pub l1: u16,
+    /// Second weight field.
+    pub l2: u16,
+    /// Third weight field.
+    pub l3: u8,
+    /// Fourth weight field; `0` when the element only lists three fields.
+    pub l4: u16,
+    /// `true` when the element starts with `*` instead of `.`.
+    pub variable: bool,
+}
+
+/// Parses the text of an `allkeys.txt` file into an [`AllKeys`].
+///
+/// Each line is trimmed and then handled as one of:
+///
+/// * blank line or line starting with `#`: ignored;
+/// * `@version MAJOR.MINOR.VARIANT`: stored in [`AllKeys::version`] (a later line replaces an
+///   earlier one);
+/// * `@implicitweights START..END; BASE # comment`: appended to [`AllKeys::implicit_weights`];
+/// * anything else: parsed as `CHARS ; [ELEMENT]... # comment` and appended to
+///   [`AllKeys::entries`].
+///
+/// The trailing `# comment` is optional; a missing or empty comment is stored as `None`.
+///
+/// # Panics
+///
+/// Panics when a line is malformed: a missing `;` or `..` separator, a number that does not
+/// parse, an entry without code points, or an element with fewer than three weights.
+pub fn parse(input: &str) -> AllKeys {
+    let mut all_keys = AllKeys::default();
+
+    for line in input.lines().map(str::trim) {
+        // Blank lines and full-line comments carry no data.
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+
+        if let Some(rest) = line.strip_prefix("@version") {
+            all_keys.version = Some(parse_version(rest));
+        } else if let Some(rest) = line.strip_prefix("@implicitweights") {
+            all_keys.implicit_weights.push(parse_implicit_weight(rest));
+        } else {
+            all_keys.entries.push(parse_entry(line));
+        }
+    }
+
+    all_keys
+}
+
+/// Splits `text` at the first `#` into the data part and the trimmed comment, if any.
+fn split_comment(text: &str) -> (&str, Option<String>) {
+    match text.split_once('#') {
+        Some((data, comment)) => {
+            let comment = comment.trim();
+            (data, (!comment.is_empty()).then(|| comment.to_string()))
+        }
+        // A line without `#` simply has no comment; it is not an error.
+        None => (text, None),
+    }
+}
+
+/// Parses a hexadecimal number, ignoring surrounding whitespace.
+fn parse_hex_u32(text: &str) -> u32 {
+    u32::from_str_radix(text.trim(), 16)
+        .unwrap_or_else(|_| panic!("invalid hexadecimal value {:?}", text))
+}
+
+/// Parses the remainder of an `@version` line (`MAJOR.MINOR.VARIANT`).
+fn parse_version(rest: &str) -> Version {
+    let mut parts = rest
+        .trim()
+        .splitn(3, '.')
+        .map(|part| part.parse::<u16>().expect("valid version component"));
+
+    Version {
+        major: parts.next().expect("major version component"),
+        minor: parts.next().expect("minor version component"),
+        variant: parts.next().expect("variant version component"),
+    }
+}
+
+/// Parses the remainder of an `@implicitweights` line (`START..END; BASE # comment`).
+fn parse_implicit_weight(rest: &str) -> ImplicitWeight {
+    let (range, base) = rest
+        .trim()
+        .split_once(';')
+        .expect("`;` between implicit weight range and base");
+    let (start, end) = range
+        .split_once("..")
+        .expect("`..` in implicit weight range");
+    let (base, comment) = split_comment(base);
+
+    ImplicitWeight {
+        start: parse_hex_u32(start),
+        end: parse_hex_u32(end),
+        base: parse_hex_u32(base),
+        comment,
+    }
+}
+
+/// Parses a data line (`CHARS ; [ELEMENT][ELEMENT]... # comment`).
+fn parse_entry(line: &str) -> Entry {
+    let (chars, rest) = line
+        .split_once(';')
+        .expect("`;` between code points and collation elements");
+
+    // `split_whitespace` tolerates tabs and repeated spaces between code points.
+    let chars = chars
+        .split_whitespace()
+        .map(parse_hex_u32)
+        .collect::<Vec<_>>();
+    assert!(!chars.is_empty(), "entry without code points: {:?}", line);
+
+    let (elements, comment) = split_comment(rest);
+    let elements = elements.split("][").map(parse_element).collect();
+
+    Entry {
+        chars,
+        elements,
+        comment,
+    }
+}
+
+/// Parses one collation element; the surrounding brackets may already have been split off.
+fn parse_element(text: &str) -> Element {
+    let text = text
+        .trim()
+        .trim_start_matches('[')
+        .trim_end_matches(']');
+
+    // The leading marker distinguishes variable (`*`) from non-variable (`.`) elements.
+    let variable = text.starts_with('*');
+
+    let mut weights = text
+        .trim_start_matches(['.', '*'])
+        .split(['.', '*']);
+
+    Element {
+        l1: weights
+            .next()
+            .and_then(|x| u16::from_str_radix(x, 16).ok())
+            .expect("valid l1 value"),
+        l2: weights
+            .next()
+            .and_then(|x| u16::from_str_radix(x, 16).ok())
+            .expect("valid l2 value"),
+        l3: weights
+            .next()
+            .and_then(|x| u8::from_str_radix(x, 16).ok())
+            .expect("valid l3 value"),
+        // The fourth weight is optional and defaults to zero when absent.
+        l4: weights
+            .next()
+            .map(|x| u16::from_str_radix(x, 16).expect("valid l4 value"))
+            .unwrap_or(0),
+        variable,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Returns the elements of the first entry whose `chars` is exactly `[code_point]`.
+    fn elements_of(all_keys: &AllKeys, code_point: u32) -> Option<&Vec<Element>> {
+        all_keys
+            .entries
+            .iter()
+            .find(|entry| entry.chars[..] == [code_point])
+            .map(|entry| &entry.elements)
+    }
+
+    /// Parses the bundled `data/allkeys.txt` and spot-checks two single-code-point entries.
+    #[test]
+    fn test_allkeys() {
+        let data = std::fs::read_to_string("data/allkeys.txt").unwrap();
+        let all_keys = parse(&data);
+
+        // Both code points are expected to map to the same single element.
+        let expected = vec![Element {
+            l1: 0,
+            l2: 51,
+            l3: 2,
+            l4: 0,
+            variable: false,
+        }];
+
+        for code_point in [0x1abc, 0x1ac1] {
+            similar_asserts::assert_eq!(elements_of(&all_keys, code_point), Some(&expected));
+        }
+    }
+}