Collect data from glottolog as well as cldr.

This commit is contained in:
2020-10-02 13:07:13 +02:00
parent 0286beb89c
commit 08914685e1
9 changed files with 67669 additions and 45958 deletions

1
txtlang-gen/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
*.tsv

View File

@@ -6,6 +6,10 @@ edition = "2018"
[dependencies]
anyhow = "1.0"
argh = "0.1"
bstr = "0.2"
isolang = "1.0"
quick-xml = "0.18"
thiserror = "1.0"
once_cell = "1.4"
quick-xml = "0.19"
regex = "1.3"
walkdir = "2.3"

182
txtlang-gen/src/cldr.rs Normal file
View File

@@ -0,0 +1,182 @@
use std::fs::File;
use std::io::BufReader;
use std::io::{self, Write};
use std::path::PathBuf;
use anyhow::Result;
use argh::FromArgs;
use isolang::Language;
use quick_xml::{events::Event, Reader};
/// One language display name collected from the `<languages>` part of a
/// CLDR `<localeDisplayNames>` section.
#[derive(PartialEq, Debug)]
struct LocalDisplayName {
/// The display name text, lowercased by the parser.
name: String,
/// The language the name refers to, resolved from the element's `type` attribute.
lang: Language,
}
// Command-line definition of the `cldr` subcommand.
// NOTE: the `///` line below is consumed by argh as the subcommand's help
// text, so it is user-facing output rather than plain documentation.
#[derive(FromArgs, PartialEq, Debug)]
/// Parse CLDR data.
#[argh(subcommand, name = "cldr")]
pub struct Cldr {
// CLDR XML files to read; `run` processes them in the order given.
#[argh(positional)]
files: Vec<PathBuf>,
}
impl Cldr {
pub fn run(&self) -> Result<()> {
let mut entries = Vec::new();
for file in &self.files {
let mut reader = File::open(&file)
.map(BufReader::new)
.map(Reader::from_reader)?;
reader.trim_text(true);
let mut buf = Vec::new();
let mut language = None;
let mut results = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Start(ref e)) if e.name() == b"identity" => {
buf.clear();
language = parse_language(&mut reader, &mut buf)
}
Ok(Event::Start(ref e)) if e.name() == b"localeDisplayNames" => {
buf.clear();
results.extend(parse_locale_display_names(&mut reader, &mut buf));
}
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
let source_language = language.unwrap_or(Language::Und);
for entry in results {
entries.push((entry.lang, source_language, entry.name));
}
}
entries.sort_unstable_by_key(|(language, source_language, _)| {
(language.to_639_3(), source_language.to_639_3())
});
let stdout = io::stdout();
let mut lock = stdout.lock();
for (language, source_language, name) in entries {
assert!(!name.is_empty());
assert!(!name.contains('\n'));
assert!(!name.contains('\t'));
writeln!(
lock,
"{}\t{}\t{}\tcldr",
language.to_639_3(),
source_language.to_639_3(),
name
)?;
}
lock.flush()?;
Ok(())
}
}
/// Reads events up to the closing `</identity>` tag (or end of input) and
/// returns the language named by the `type` attribute of an empty
/// `<language/>` element, if it can be resolved.
///
/// The code is tried as ISO 639-1, then ISO 639-3, then as a locale. When
/// several `<language/>` elements with a decodable `type` occur, the last one
/// decides the result.
///
/// # Panics
///
/// Panics when the XML reader reports an error.
fn parse_language(reader: &mut Reader<BufReader<File>>, buf: &mut Vec<u8>) -> Option<Language> {
    let mut found = None;

    loop {
        match reader.read_event(buf) {
            Ok(Event::Eof) => break,
            Ok(Event::End(ref closing)) if closing.name() == b"identity" => break,
            Ok(Event::Empty(ref tag)) if tag.name() == b"language" => {
                let type_attr = tag
                    .attributes()
                    .filter_map(Result::ok)
                    .find(|attr| attr.key == b"type");
                if let Some(attr) = type_attr {
                    // An undecodable value leaves any earlier result untouched.
                    if let Ok(code) = reader.decode_without_bom(&attr.value) {
                        found = Language::from_639_1(code)
                            .or_else(|| Language::from_639_3(code))
                            .or_else(|| Language::from_locale(code));
                    }
                }
            }
            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
            _ => {}
        }
        buf.clear();
    }

    found
}
fn parse_locale_display_names(
reader: &mut Reader<BufReader<File>>,
buf: &mut Vec<u8>,
) -> Vec<LocalDisplayName> {
let mut result = Vec::new();
let mut parse = false;
let mut language = None;
loop {
match reader.read_event(buf) {
Ok(Event::End(ref e)) if e.name() == b"localeDisplayNames" => {
break;
}
Ok(Event::Start(ref e)) if e.name() == b"languages" => {
parse = true;
}
Ok(Event::End(ref e)) if e.name() == b"languages" => {
parse = false;
}
Ok(Event::Start(ref e)) if e.name() == b"language" && parse => {
let a = e
.attributes()
.filter_map(Result::ok)
.find(|a| a.key == b"type");
if let Some(a) = a {
if let Ok(lang) = reader.decode_without_bom(&a.value) {
language = Language::from_639_1(lang)
.or_else(|| Language::from_639_3(lang))
.or_else(|| Language::from_locale(lang));
}
}
}
Ok(Event::End(ref e)) if e.name() == b"language" && parse => {
language = None;
}
Ok(Event::Text(e)) if language.is_some() => {
if let Ok(text) = reader.decode_without_bom(&e) {
result.push(LocalDisplayName {
name: text.to_lowercase(),
lang: language.take().unwrap(),
});
}
}
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
result
}

View File

@@ -0,0 +1,259 @@
use std::fs;
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use anyhow::Result;
use argh::FromArgs;
use isolang::Language;
use once_cell::sync::Lazy;
use regex::Regex;
use walkdir::{DirEntry, WalkDir};
// Command-line definition of the `glottolog` subcommand.
// NOTE: the `///` line below is consumed by argh as the subcommand's help
// text, so it is user-facing output rather than plain documentation.
#[derive(FromArgs, PartialEq, Debug)]
/// Parse Glottolog data.
#[argh(subcommand, name = "glottolog")]
pub struct Glottolog {
// Either a directory that is searched recursively for `md.ini` files, or the
// path of a single file to parse.
#[argh(positional)]
path: PathBuf,
}
impl Glottolog {
pub fn run(&self) -> Result<()> {
let files = if self.path.is_dir() {
WalkDir::new(&self.path)
.into_iter()
.filter_map(Result::ok)
.filter(is_match)
.map(|e| e.path().to_owned())
.collect::<Vec<_>>()
} else {
vec![self.path.clone()]
};
let mut entries = Vec::new();
for file in files {
if let Some(entry) = Entry::from_path(&file) {
for (name, source) in entry.alt_names {
entries.push((entry.language, source.unwrap_or(Language::Und), name));
}
}
}
entries.sort_unstable_by_key(|(language, source_language, _)| {
(language.to_639_3(), source_language.to_639_3())
});
let stdout = io::stdout();
let mut lock = stdout.lock();
for (language, source_language, name) in entries {
assert!(!name.is_empty());
assert!(!name.contains('\n'));
assert!(!name.contains('\t'));
writeln!(
lock,
"{}\t{}\t{}\tglottolog",
language.to_639_3(),
source_language.to_639_3(),
name
)?;
}
lock.flush()?;
Ok(())
}
}
/// The data extracted from one Glottolog `md.ini` file.
#[derive(Debug)]
struct Entry {
/// Language described by the file, resolved from its `iso639-3` value.
language: Language,
/// Lowercased alternative names, each paired with the language the name is
/// written in when the file states one.
alt_names: Vec<(String, Option<Language>)>,
}
impl Entry {
    /// Reads the `md.ini` file at `path` and extracts the language together with
    /// its `lexvo` alternative names.
    ///
    /// Returns `None` when the file cannot be read or parsed, when its `[core]`
    /// section is missing, does not have `level = language`, or lacks a
    /// resolvable `iso639-3` code, or when there is no `[altnames]` section.
    ///
    /// Every non-empty line of the `lexvo` value yields one lowercased name. A
    /// line of the form `name [xx]` additionally records `xx` (ISO 639-1 or
    /// ISO 639-3) as the language of that name.
    fn from_path(path: &Path) -> Option<Entry> {
        static RE_LEXVO: Lazy<Regex> = Lazy::new(|| Regex::new(r"(.*) \[([a-z]{2,3})\]").unwrap());

        let contents = fs::read_to_string(path).ok()?;
        let sections = ini::parse(&contents)?;

        let core = sections.get(&Some("core".to_string()))?;
        match core.get("level") {
            Some(level) if level == "language" => {}
            _ => return None,
        }
        let language = core
            .get("iso639-3")
            .and_then(|code| Language::from_639_3(code))?;

        let altnames = sections.get(&Some("altnames".to_string()))?;
        let mut alt_names = Vec::new();
        if let Some(lexvo) = altnames.get("lexvo") {
            for line in lexvo.lines().map(str::trim) {
                let (name, source) = match RE_LEXVO.captures(line) {
                    Some(cap) => (
                        cap[1].trim().to_lowercase(),
                        Language::from_639_1(&cap[2]).or_else(|| Language::from_639_3(&cap[2])),
                    ),
                    None => (line.trim().to_lowercase(), None),
                };
                if !name.is_empty() {
                    alt_names.push((name, source));
                }
            }
        }

        Some(Entry {
            language,
            alt_names,
        })
    }
}
/// Returns `true` when the directory entry's file name is exactly `md.ini`.
///
/// File names that are not valid UTF-8 never match.
fn is_match(entry: &DirEntry) -> bool {
    entry.file_name().to_str() == Some("md.ini")
}
/// A small INI parser for the `md.ini` files read by this tool.
///
/// Supported syntax: `#` comment lines, `[section]` headers, `key = value`
/// pairs, and multi-line values written as an empty first line followed by
/// tab-indented continuation lines.
mod ini {
    use std::collections::HashMap;

    use bstr::ByteSlice;

    /// Parses `input` into a map from section name to that section's key/value
    /// pairs.
    ///
    /// Pairs that appear before the first section header are stored under the
    /// key `None`. A section header creates its (possibly empty) map right
    /// away. Invalid UTF-8 is replaced lossily.
    ///
    /// Returns `None` when a section header has no closing `]` or when a
    /// key/value line contains no `=`.
    pub fn parse(
        input: impl AsRef<[u8]>,
    ) -> Option<HashMap<Option<String>, HashMap<String, String>>> {
        let mut result: HashMap<Option<String>, HashMap<String, String>> = HashMap::new();
        // Name of the section that key/value pairs are currently added to.
        let mut current_section: Option<String> = None;
        let mut input = input.as_ref();

        loop {
            input = eat_whitespace(input);
            let first = match input.first() {
                Some(&byte) => byte,
                None => break,
            };

            match first {
                b'#' => {
                    let (rest, _comment) = parse_comment(&input[1..]);
                    input = rest;
                }
                b'[' => {
                    let (rest, section) = parse_section(&input[1..])?;
                    input = rest;
                    let name = Some(section.to_str_lossy().into_owned());
                    result.entry(name.clone()).or_insert_with(HashMap::new);
                    current_section = name;
                }
                _ => {
                    let (rest, key, value) = parse_key_value(input)?;
                    input = rest;
                    result
                        .entry(current_section.clone())
                        .or_insert_with(HashMap::new)
                        .insert(
                            key.to_str_lossy().into_owned(),
                            value.to_str_lossy().into_owned(),
                        );
                }
            }
        }

        Some(result)
    }

    /// Skips leading whitespace, including line breaks.
    fn eat_whitespace(input: &[u8]) -> &[u8] {
        input.trim_start()
    }

    /// Parses the remainder of a comment line (`input` starts right after the
    /// `#`) and returns `(rest, comment)`.
    ///
    /// The comment ends at the next newline or at the end of the input. The
    /// line end is located before any trimming so that an empty comment cannot
    /// swallow the following line.
    fn parse_comment(input: &[u8]) -> (&[u8], &[u8]) {
        let end = input.find_byte(b'\n').unwrap_or_else(|| input.len());
        (&input[end..], input[..end].trim())
    }

    /// Parses a section header (`input` starts right after the `[`) and returns
    /// `(rest, name)`, or `None` when the closing `]` is missing.
    fn parse_section(input: &[u8]) -> Option<(&[u8], &[u8])> {
        let end = input.find_byte(b']')?;
        Some((&input[end + 1..], input[..end].trim()))
    }

    /// Parses one `key = value` pair and returns `(rest, key, value)`, or `None`
    /// when no `=` is found.
    ///
    /// When nothing but whitespace follows the `=` on its line, the value is
    /// made up of the directly following lines that start with a tab. The end
    /// of the input is accepted wherever a newline would end a line.
    fn parse_key_value(input: &[u8]) -> Option<(&[u8], &[u8], &[u8])> {
        let eq = input.find_byte(b'=')?;
        let key = input[..eq].trim();
        let input = &input[eq + 1..];

        // End of the current line (exclusive) and start of the next line; both
        // are the end of the input when the last line has no trailing newline.
        let (line_end, next_line) = match input.find_byte(b'\n') {
            Some(newline) => (newline, newline + 1),
            None => (input.len(), input.len()),
        };

        let inline_value = input[..line_end].trim();
        if !inline_value.is_empty() {
            return Some((&input[next_line..], key, inline_value));
        }

        // Multi-line value: consume every following tab-indented line.
        // `get` keeps this in bounds when the input ends here.
        let start = next_line;
        let mut idx = start;
        while input.get(idx) == Some(&b'\t') {
            idx = match input[idx..].find_byte(b'\n') {
                Some(newline) => idx + newline + 1,
                None => input.len(),
            };
        }

        Some((&input[idx..], key, input[start..idx].trim()))
    }
}

View File

@@ -1,159 +1,31 @@
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::io::{self, Write};
use anyhow::Result;
use isolang::Language;
use quick_xml::{events::Event, Reader};
use thiserror::Error;
use argh::FromArgs;
#[derive(Error, Debug)]
enum Error {
#[error("xml error")]
Xml(#[from] quick_xml::Error),
mod cldr;
mod glottolog;
mod merge;
// Top-level command-line options: nothing but the subcommand to run.
// NOTE: the `///` line below is consumed by argh as the program's help text.
#[derive(FromArgs, PartialEq, Debug)]
/// txtlang-gen.
struct Opt {
// The selected subcommand.
#[argh(subcommand)]
cmd: Command,
}
#[derive(PartialEq, Debug)]
struct LocalDisplayName {
name: String,
lang: Language,
// The available subcommands; each variant wraps the argument struct defined
// in the module of the same name.
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand)]
enum Command {
Cldr(cldr::Cldr),
Glottolog(glottolog::Glottolog),
Merge(merge::Merge),
}
fn main() -> Result<()> {
let inputs = env::args().skip(1);
let opt: Opt = argh::from_env();
let mut results = Vec::new();
for input in inputs {
let mut reader = File::open(&input)
.map(BufReader::new)
.map(Reader::from_reader)?;
reader.trim_text(true);
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Start(ref e)) if e.name() == b"localeDisplayNames" => {
buf.clear();
results.extend(parse::locale_display_names(&mut reader, &mut buf));
}
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
}
let before = results.len();
results.sort_unstable_by(|a, b| a.name.cmp(&b.name));
results.dedup();
let dedup = results.len();
let mut remove = Vec::new();
results.dedup_by(|a, b| {
if a.name == b.name {
remove.push(a.name.clone());
}
a.name == b.name
});
results.retain(|x| !remove.contains(&x.name));
let clean = results.len();
eprintln!("before={}", before);
eprintln!("dedup={}", dedup);
eprintln!("clean={}", clean);
let stdout = io::stdout();
let mut lock = stdout.lock();
for result in results {
assert!(!result.name.contains('\n'));
assert!(!result.name.contains('\t'));
writeln!(lock, "{}\t{}", result.name, result.lang.to_639_3())
.expect("failed to write to stdout");
}
lock.flush().expect("failed to flush to stdout");
Ok(())
}
mod parse {
use std::fs::File;
use std::io::BufReader;
use anyhow::Result;
use isolang::Language;
use quick_xml::{events::Event, Reader};
use crate::LocalDisplayName;
pub(crate) fn locale_display_names(
reader: &mut Reader<BufReader<File>>,
buf: &mut Vec<u8>,
) -> Vec<LocalDisplayName> {
let mut result = Vec::new();
let mut parse = false;
let mut language = None;
loop {
match reader.read_event(buf) {
Ok(Event::End(ref e)) if e.name() == b"localeDisplayNames" => {
break;
}
Ok(Event::Start(ref e)) if e.name() == b"languages" => {
parse = true;
}
Ok(Event::End(ref e)) if e.name() == b"languages" => {
parse = false;
}
Ok(Event::Start(ref e)) if e.name() == b"language" && parse => {
let a = e
.attributes()
.filter_map(Result::ok)
.find(|a| a.key == b"type");
if let Some(a) = a {
if let Ok(lang) = reader.decode_without_bom(&a.value) {
language = Language::from_639_1(lang)
.or_else(|| Language::from_639_3(lang))
.or_else(|| Language::from_locale(lang));
}
}
}
Ok(Event::End(ref e)) if e.name() == b"language" && parse => {
language = None;
}
Ok(Event::Text(e)) if language.is_some() => {
if let Ok(text) = reader.decode_without_bom(&e) {
result.push(LocalDisplayName {
name: text.to_lowercase(),
lang: language.take().unwrap(),
});
}
}
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
result
match opt.cmd {
Command::Cldr(cmd) => cmd.run(),
Command::Glottolog(cmd) => cmd.run(),
Command::Merge(cmd) => cmd.run(),
}
}

108
txtlang-gen/src/merge.rs Normal file
View File

@@ -0,0 +1,108 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead, BufReader, Write};
use std::path::PathBuf;
use anyhow::Result;
use argh::FromArgs;
use isolang::Language;
/// Everything collected for one language name while merging TSV files.
#[derive(Debug)]
struct Entry {
/// The language the name was assigned to by the first row that mentioned it.
language: Language,
/// Source languages (second TSV column) of all rows contributing this name.
source_languages: Vec<Language>,
/// Data source labels (fourth TSV column) of all rows contributing this name.
sources: Vec<String>,
}
// Command-line definition of the `merge` subcommand.
// NOTE: the `///` line below is consumed by argh as the subcommand's help
// text, so it is user-facing output rather than plain documentation.
#[derive(FromArgs, PartialEq, Debug)]
/// Merge TSV files.
#[argh(subcommand, name = "merge")]
pub struct Merge {
// TSV files to merge, read in the order given.
#[argh(positional)]
files: Vec<PathBuf>,
}
impl Merge {
pub fn run(&self) -> Result<()> {
let mut entries = HashMap::new();
for file in &self.files {
let reader = File::open(&file).map(BufReader::new)?;
for line in reader.lines() {
let line = line?;
let parts = line.splitn(4, '\t').collect::<Vec<_>>();
let language = Language::from_639_3(parts[0]).unwrap();
let source_language = Language::from_639_3(parts[1]).unwrap();
let name = parts[2].to_string();
let source = parts[3].to_string();
if entries
.get(&name)
.map(|entry: &Entry| entry.language != language)
.unwrap_or(false)
{
// TODO Print or log this.
entries.remove(&name);
} else {
entries
.entry(name.clone())
.and_modify(|entry: &mut Entry| {
entry.source_languages.push(source_language);
entry.sources.push(source.clone());
})
.or_insert_with(|| Entry {
language,
source_languages: vec![source_language],
sources: vec![source],
});
}
}
}
let mut entries = entries
.into_iter()
.map(|(name, mut entry)| {
entry
.source_languages
.sort_by_key(|language| language.to_639_3());
entry.source_languages.dedup();
entry.sources.sort();
entry.sources.dedup();
(entry.language, name, entry.source_languages, entry.sources)
})
.collect::<Vec<_>>();
entries.sort_unstable_by_key(|(language, name, ..)| (language.to_639_3(), name.clone()));
let stdout = io::stdout();
let mut lock = stdout.lock();
for (language, name, source_language, sources) in entries {
assert!(!name.is_empty());
assert!(!name.contains('\n'));
assert!(!name.contains('\t'));
writeln!(
lock,
"{}\t{}\t{}\t{}",
language.to_639_3(),
name,
source_language
.iter()
.map(|language| language.to_639_3())
.collect::<Vec<_>>()
.join(", "),
sources.join(", ")
)?;
}
lock.flush()?;
Ok(())
}
}