Collect data from Glottolog as well as CLDR.
This commit is contained in:
@@ -1,159 +1,31 @@
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::Result;
|
||||
use isolang::Language;
|
||||
use quick_xml::{events::Event, Reader};
|
||||
use thiserror::Error;
|
||||
use argh::FromArgs;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
enum Error {
|
||||
#[error("xml error")]
|
||||
Xml(#[from] quick_xml::Error),
|
||||
mod cldr;
|
||||
mod glottolog;
|
||||
mod merge;
|
||||
|
||||
// Top-level command-line options for the generator.
// The struct derives argh's `FromArgs`; `main` builds it with `argh::from_env()`.
// NOTE(review): argh presumably surfaces the `///` line below as the program's
// help description — confirm before rewording it.
#[derive(FromArgs, PartialEq, Debug)]
|
||||
/// txtlang-gen.
|
||||
struct Opt {
|
||||
// The selected subcommand; `main` matches on this field to dispatch.
#[argh(subcommand)]
|
||||
cmd: Command,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Debug)]
|
||||
struct LocalDisplayName {
|
||||
name: String,
|
||||
lang: Language,
|
||||
// The set of subcommands accepted on the command line.
// Each variant wraps the argument struct declared in the module of the same
// name (`cldr`, `glottolog`, `merge`); `main` calls `run()` on the wrapped value.
#[derive(FromArgs, PartialEq, Debug)]
|
||||
#[argh(subcommand)]
|
||||
enum Command {
|
||||
// Handled by `cldr::Cldr`.
Cldr(cldr::Cldr),
|
||||
// Handled by `glottolog::Glottolog`.
Glottolog(glottolog::Glottolog),
|
||||
// Handled by `merge::Merge`.
Merge(merge::Merge),
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let inputs = env::args().skip(1);
|
||||
let opt: Opt = argh::from_env();
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for input in inputs {
|
||||
let mut reader = File::open(&input)
|
||||
.map(BufReader::new)
|
||||
.map(Reader::from_reader)?;
|
||||
|
||||
reader.trim_text(true);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event(&mut buf) {
|
||||
Ok(Event::Start(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
buf.clear();
|
||||
|
||||
results.extend(parse::locale_display_names(&mut reader, &mut buf));
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
}
|
||||
|
||||
let before = results.len();
|
||||
|
||||
results.sort_unstable_by(|a, b| a.name.cmp(&b.name));
|
||||
results.dedup();
|
||||
|
||||
let dedup = results.len();
|
||||
|
||||
let mut remove = Vec::new();
|
||||
|
||||
results.dedup_by(|a, b| {
|
||||
if a.name == b.name {
|
||||
remove.push(a.name.clone());
|
||||
}
|
||||
|
||||
a.name == b.name
|
||||
});
|
||||
|
||||
results.retain(|x| !remove.contains(&x.name));
|
||||
|
||||
let clean = results.len();
|
||||
|
||||
eprintln!("before={}", before);
|
||||
eprintln!("dedup={}", dedup);
|
||||
eprintln!("clean={}", clean);
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut lock = stdout.lock();
|
||||
|
||||
for result in results {
|
||||
assert!(!result.name.contains('\n'));
|
||||
assert!(!result.name.contains('\t'));
|
||||
|
||||
writeln!(lock, "{}\t{}", result.name, result.lang.to_639_3())
|
||||
.expect("failed to write to stdout");
|
||||
}
|
||||
|
||||
lock.flush().expect("failed to flush to stdout");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
mod parse {
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use anyhow::Result;
|
||||
use isolang::Language;
|
||||
use quick_xml::{events::Event, Reader};
|
||||
|
||||
use crate::LocalDisplayName;
|
||||
|
||||
pub(crate) fn locale_display_names(
|
||||
reader: &mut Reader<BufReader<File>>,
|
||||
buf: &mut Vec<u8>,
|
||||
) -> Vec<LocalDisplayName> {
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut parse = false;
|
||||
let mut language = None;
|
||||
|
||||
loop {
|
||||
match reader.read_event(buf) {
|
||||
Ok(Event::End(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
break;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"languages" => {
|
||||
parse = true;
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"languages" => {
|
||||
parse = false;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"language" && parse => {
|
||||
let a = e
|
||||
.attributes()
|
||||
.filter_map(Result::ok)
|
||||
.find(|a| a.key == b"type");
|
||||
|
||||
if let Some(a) = a {
|
||||
if let Ok(lang) = reader.decode_without_bom(&a.value) {
|
||||
language = Language::from_639_1(lang)
|
||||
.or_else(|| Language::from_639_3(lang))
|
||||
.or_else(|| Language::from_locale(lang));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"language" && parse => {
|
||||
language = None;
|
||||
}
|
||||
Ok(Event::Text(e)) if language.is_some() => {
|
||||
if let Ok(text) = reader.decode_without_bom(&e) {
|
||||
result.push(LocalDisplayName {
|
||||
name: text.to_lowercase(),
|
||||
lang: language.take().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
result
|
||||
match opt.cmd {
|
||||
Command::Cldr(cmd) => cmd.run(),
|
||||
Command::Glottolog(cmd) => cmd.run(),
|
||||
Command::Merge(cmd) => cmd.run(),
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user