Collect data from glottolog as well as cldr.
This commit is contained in:
1
txtlang-gen/.gitignore
vendored
Normal file
1
txtlang-gen/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
*.tsv
|
||||
@@ -6,6 +6,10 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
argh = "0.1"
|
||||
bstr = "0.2"
|
||||
isolang = "1.0"
|
||||
quick-xml = "0.18"
|
||||
thiserror = "1.0"
|
||||
once_cell = "1.4"
|
||||
quick-xml = "0.19"
|
||||
regex = "1.3"
|
||||
walkdir = "2.3"
|
||||
|
||||
182
txtlang-gen/src/cldr.rs
Normal file
182
txtlang-gen/src/cldr.rs
Normal file
@@ -0,0 +1,182 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::io::{self, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Result;
|
||||
use argh::FromArgs;
|
||||
use isolang::Language;
|
||||
use quick_xml::{events::Event, Reader};
|
||||
|
||||
#[derive(PartialEq, Debug)]
|
||||
struct LocalDisplayName {
|
||||
name: String,
|
||||
lang: Language,
|
||||
}
|
||||
|
||||
#[derive(FromArgs, PartialEq, Debug)]
|
||||
/// Parse CLDR data.
|
||||
#[argh(subcommand, name = "cldr")]
|
||||
pub struct Cldr {
|
||||
#[argh(positional)]
|
||||
files: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
impl Cldr {
|
||||
pub fn run(&self) -> Result<()> {
|
||||
let mut entries = Vec::new();
|
||||
|
||||
for file in &self.files {
|
||||
let mut reader = File::open(&file)
|
||||
.map(BufReader::new)
|
||||
.map(Reader::from_reader)?;
|
||||
|
||||
reader.trim_text(true);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
let mut language = None;
|
||||
let mut results = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event(&mut buf) {
|
||||
Ok(Event::Start(ref e)) if e.name() == b"identity" => {
|
||||
buf.clear();
|
||||
|
||||
language = parse_language(&mut reader, &mut buf)
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
buf.clear();
|
||||
|
||||
results.extend(parse_locale_display_names(&mut reader, &mut buf));
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
let source_language = language.unwrap_or(Language::Und);
|
||||
|
||||
for entry in results {
|
||||
entries.push((entry.lang, source_language, entry.name));
|
||||
}
|
||||
}
|
||||
|
||||
entries.sort_unstable_by_key(|(language, source_language, _)| {
|
||||
(language.to_639_3(), source_language.to_639_3())
|
||||
});
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut lock = stdout.lock();
|
||||
|
||||
for (language, source_language, name) in entries {
|
||||
assert!(!name.is_empty());
|
||||
assert!(!name.contains('\n'));
|
||||
assert!(!name.contains('\t'));
|
||||
|
||||
writeln!(
|
||||
lock,
|
||||
"{}\t{}\t{}\tcldr",
|
||||
language.to_639_3(),
|
||||
source_language.to_639_3(),
|
||||
name
|
||||
)?;
|
||||
}
|
||||
|
||||
lock.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_language(reader: &mut Reader<BufReader<File>>, buf: &mut Vec<u8>) -> Option<Language> {
|
||||
let mut language = None;
|
||||
|
||||
loop {
|
||||
match reader.read_event(buf) {
|
||||
Ok(Event::Empty(ref e)) if e.name() == b"language" => {
|
||||
let a = e
|
||||
.attributes()
|
||||
.filter_map(Result::ok)
|
||||
.find(|a| a.key == b"type");
|
||||
|
||||
if let Some(a) = a {
|
||||
if let Ok(lang) = reader.decode_without_bom(&a.value) {
|
||||
language = Language::from_639_1(lang)
|
||||
.or_else(|| Language::from_639_3(lang))
|
||||
.or_else(|| Language::from_locale(lang));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"identity" => {
|
||||
break;
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
language
|
||||
}
|
||||
|
||||
fn parse_locale_display_names(
|
||||
reader: &mut Reader<BufReader<File>>,
|
||||
buf: &mut Vec<u8>,
|
||||
) -> Vec<LocalDisplayName> {
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut parse = false;
|
||||
let mut language = None;
|
||||
|
||||
loop {
|
||||
match reader.read_event(buf) {
|
||||
Ok(Event::End(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
break;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"languages" => {
|
||||
parse = true;
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"languages" => {
|
||||
parse = false;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"language" && parse => {
|
||||
let a = e
|
||||
.attributes()
|
||||
.filter_map(Result::ok)
|
||||
.find(|a| a.key == b"type");
|
||||
|
||||
if let Some(a) = a {
|
||||
if let Ok(lang) = reader.decode_without_bom(&a.value) {
|
||||
language = Language::from_639_1(lang)
|
||||
.or_else(|| Language::from_639_3(lang))
|
||||
.or_else(|| Language::from_locale(lang));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"language" && parse => {
|
||||
language = None;
|
||||
}
|
||||
Ok(Event::Text(e)) if language.is_some() => {
|
||||
if let Ok(text) = reader.decode_without_bom(&e) {
|
||||
result.push(LocalDisplayName {
|
||||
name: text.to_lowercase(),
|
||||
lang: language.take().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
259
txtlang-gen/src/glottolog.rs
Normal file
259
txtlang-gen/src/glottolog.rs
Normal file
@@ -0,0 +1,259 @@
|
||||
use std::fs;
|
||||
use std::io::{self, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use argh::FromArgs;
|
||||
use isolang::Language;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
#[derive(FromArgs, PartialEq, Debug)]
|
||||
/// Parse Glottolog data.
|
||||
#[argh(subcommand, name = "glottolog")]
|
||||
pub struct Glottolog {
|
||||
#[argh(positional)]
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl Glottolog {
|
||||
pub fn run(&self) -> Result<()> {
|
||||
let files = if self.path.is_dir() {
|
||||
WalkDir::new(&self.path)
|
||||
.into_iter()
|
||||
.filter_map(Result::ok)
|
||||
.filter(is_match)
|
||||
.map(|e| e.path().to_owned())
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
vec![self.path.clone()]
|
||||
};
|
||||
|
||||
let mut entries = Vec::new();
|
||||
|
||||
for file in files {
|
||||
if let Some(entry) = Entry::from_path(&file) {
|
||||
for (name, source) in entry.alt_names {
|
||||
entries.push((entry.language, source.unwrap_or(Language::Und), name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
entries.sort_unstable_by_key(|(language, source_language, _)| {
|
||||
(language.to_639_3(), source_language.to_639_3())
|
||||
});
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut lock = stdout.lock();
|
||||
|
||||
for (language, source_language, name) in entries {
|
||||
assert!(!name.is_empty());
|
||||
assert!(!name.contains('\n'));
|
||||
assert!(!name.contains('\t'));
|
||||
|
||||
writeln!(
|
||||
lock,
|
||||
"{}\t{}\t{}\tglottolog",
|
||||
language.to_639_3(),
|
||||
source_language.to_639_3(),
|
||||
name
|
||||
)?;
|
||||
}
|
||||
|
||||
lock.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Entry {
|
||||
language: Language,
|
||||
alt_names: Vec<(String, Option<Language>)>,
|
||||
}
|
||||
|
||||
impl Entry {
|
||||
fn from_path(path: &Path) -> Option<Entry> {
|
||||
static RE_LEXVO: Lazy<Regex> = Lazy::new(|| Regex::new(r"(.*) \[([a-z]{2,3})\]").unwrap());
|
||||
|
||||
let text = fs::read_to_string(path).ok()?;
|
||||
|
||||
let data = ini::parse(&text)?;
|
||||
|
||||
if !data
|
||||
.get(&Some("core".to_string()))?
|
||||
.get("level")
|
||||
.map(|level| level == "language")
|
||||
.unwrap_or(false)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let language = data
|
||||
.get(&Some("core".to_string()))?
|
||||
.get("iso639-3")
|
||||
.and_then(|iso| Language::from_639_3(iso))?;
|
||||
|
||||
let mut alt_names = Vec::new();
|
||||
|
||||
for (source, names) in data.get(&Some("altnames".to_string()))? {
|
||||
if source == "lexvo" {
|
||||
let names = names.lines().map(|line| line.trim()).collect::<Vec<_>>();
|
||||
|
||||
for name in names {
|
||||
if let Some(cap) = RE_LEXVO.captures(name) {
|
||||
let source =
|
||||
Language::from_639_1(&cap[2]).or_else(|| Language::from_639_3(&cap[2]));
|
||||
|
||||
let name = cap[1].trim().to_lowercase();
|
||||
|
||||
if name.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
alt_names.push((name, source));
|
||||
} else {
|
||||
let name = name.trim().to_lowercase();
|
||||
|
||||
if name.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
alt_names.push((name, None));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some(Entry {
|
||||
language,
|
||||
alt_names,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match(entry: &DirEntry) -> bool {
|
||||
entry
|
||||
.file_name()
|
||||
.to_str()
|
||||
.map(|s| s == "md.ini")
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// A minimal parser for the INI dialect used by Glottolog `md.ini` files.
mod ini {
    use std::collections::HashMap;

    /// Parses INI text into a map from section name to that section's
    /// key/value pairs.
    ///
    /// Keys that appear before any `[section]` header are stored under the
    /// `None` section. Supported syntax:
    ///
    /// * `# comment` lines,
    /// * `[section]` headers,
    /// * `key = value` on one line,
    /// * `key =` followed by lines starting with a tab, which together form a
    ///   multi-line value (lines joined with `\n`, outer whitespace trimmed).
    ///
    /// Bytes that are not valid UTF-8 are replaced with U+FFFD. The final
    /// line does not need a trailing newline.
    ///
    /// Returns `None` for malformed input: a `[` without a closing `]`, or
    /// text that is expected to be a `key = value` entry but has no `=`.
    pub fn parse(
        input: impl AsRef<[u8]>,
    ) -> Option<HashMap<Option<String>, HashMap<String, String>>> {
        let text = String::from_utf8_lossy(input.as_ref());

        let mut result: HashMap<Option<String>, HashMap<String, String>> = HashMap::new();
        // Name of the section that receives the next key; `None` before the
        // first header.
        let mut current_section: Option<String> = None;

        let mut input: &str = &text;

        loop {
            input = eat_whitespace(input);

            let first = match input.as_bytes().first() {
                Some(byte) => *byte,
                None => break,
            };

            match first {
                b'#' => {
                    let (rest, _comment) = parse_comment(&input[1..])?;

                    input = rest;
                }
                b'[' => {
                    let (rest, section) = parse_section(&input[1..])?;

                    input = rest;

                    // Register the section even if it turns out to be empty.
                    let name = Some(section.to_string());
                    result.entry(name.clone()).or_insert_with(HashMap::new);
                    current_section = name;
                }
                _ => {
                    let (rest, key, value) = parse_key_value(input)?;

                    input = rest;

                    result
                        .entry(current_section.clone())
                        .or_insert_with(HashMap::new)
                        .insert(key.to_string(), value.to_string());
                }
            }
        }

        Some(result)
    }

    /// Skips leading whitespace, including newlines.
    fn eat_whitespace(input: &str) -> &str {
        input.trim_start()
    }

    /// Splits `input` at the first newline into the line (without the
    /// newline) and everything after it. Without a newline the whole input is
    /// the line and the rest is empty.
    fn split_line(input: &str) -> (&str, &str) {
        match input.find('\n') {
            Some(end) => (&input[..end], &input[end + 1..]),
            None => (input, &input[input.len()..]),
        }
    }

    /// Parses the remainder of a comment line; `input` starts right after the
    /// `#`. Returns the unparsed rest and the trimmed comment text.
    ///
    /// The comment ends at the first newline or at end of input, so an empty
    /// comment never extends into the following line.
    fn parse_comment(input: &str) -> Option<(&str, &str)> {
        let end = input.find('\n').unwrap_or_else(|| input.len());

        Some((&input[end..], input[..end].trim()))
    }

    /// Parses a section header; `input` starts right after the `[`. Returns
    /// the unparsed rest and the trimmed section name, or `None` when the
    /// closing `]` is missing.
    fn parse_section(input: &str) -> Option<(&str, &str)> {
        let input = eat_whitespace(input);
        let end = input.find(']')?;

        Some((&input[end + 1..], input[..end].trim()))
    }

    /// Parses one `key = value` entry. Returns the unparsed rest, the trimmed
    /// key and the trimmed value, or `None` when no `=` follows.
    ///
    /// When nothing but whitespace follows the `=` on its line, all directly
    /// following lines that start with a tab are taken as the value.
    fn parse_key_value(input: &str) -> Option<(&str, &str, &str)> {
        // NOTE: the `=` is searched in all remaining input, so a line without
        // `=` is merged into the key of a later entry.
        let eq = input.find('=')?;
        let key = input[..eq].trim();

        let (first_line, after_first_line) = split_line(&input[eq + 1..]);

        if !first_line.trim().is_empty() {
            return Some((after_first_line, key, first_line.trim()));
        }

        // Multi-line value: consume every tab-indented continuation line.
        let mut rest = after_first_line;

        while rest.starts_with('\t') {
            rest = split_line(rest).1;
        }

        let consumed = after_first_line.len() - rest.len();
        let value = after_first_line[..consumed].trim();

        Some((rest, key, value))
    }
}
|
||||
@@ -1,159 +1,31 @@
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::Result;
|
||||
use isolang::Language;
|
||||
use quick_xml::{events::Event, Reader};
|
||||
use thiserror::Error;
|
||||
use argh::FromArgs;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
enum Error {
|
||||
#[error("xml error")]
|
||||
Xml(#[from] quick_xml::Error),
|
||||
mod cldr;
|
||||
mod glottolog;
|
||||
mod merge;
|
||||
|
||||
#[derive(FromArgs, PartialEq, Debug)]
|
||||
/// txtlang-gen.
|
||||
struct Opt {
|
||||
#[argh(subcommand)]
|
||||
cmd: Command,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Debug)]
|
||||
struct LocalDisplayName {
|
||||
name: String,
|
||||
lang: Language,
|
||||
#[derive(FromArgs, PartialEq, Debug)]
|
||||
#[argh(subcommand)]
|
||||
enum Command {
|
||||
Cldr(cldr::Cldr),
|
||||
Glottolog(glottolog::Glottolog),
|
||||
Merge(merge::Merge),
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let inputs = env::args().skip(1);
|
||||
let opt: Opt = argh::from_env();
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for input in inputs {
|
||||
let mut reader = File::open(&input)
|
||||
.map(BufReader::new)
|
||||
.map(Reader::from_reader)?;
|
||||
|
||||
reader.trim_text(true);
|
||||
|
||||
let mut buf = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event(&mut buf) {
|
||||
Ok(Event::Start(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
buf.clear();
|
||||
|
||||
results.extend(parse::locale_display_names(&mut reader, &mut buf));
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
}
|
||||
|
||||
let before = results.len();
|
||||
|
||||
results.sort_unstable_by(|a, b| a.name.cmp(&b.name));
|
||||
results.dedup();
|
||||
|
||||
let dedup = results.len();
|
||||
|
||||
let mut remove = Vec::new();
|
||||
|
||||
results.dedup_by(|a, b| {
|
||||
if a.name == b.name {
|
||||
remove.push(a.name.clone());
|
||||
}
|
||||
|
||||
a.name == b.name
|
||||
});
|
||||
|
||||
results.retain(|x| !remove.contains(&x.name));
|
||||
|
||||
let clean = results.len();
|
||||
|
||||
eprintln!("before={}", before);
|
||||
eprintln!("dedup={}", dedup);
|
||||
eprintln!("clean={}", clean);
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut lock = stdout.lock();
|
||||
|
||||
for result in results {
|
||||
assert!(!result.name.contains('\n'));
|
||||
assert!(!result.name.contains('\t'));
|
||||
|
||||
writeln!(lock, "{}\t{}", result.name, result.lang.to_639_3())
|
||||
.expect("failed to write to stdout");
|
||||
}
|
||||
|
||||
lock.flush().expect("failed to flush to stdout");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
mod parse {
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use anyhow::Result;
|
||||
use isolang::Language;
|
||||
use quick_xml::{events::Event, Reader};
|
||||
|
||||
use crate::LocalDisplayName;
|
||||
|
||||
pub(crate) fn locale_display_names(
|
||||
reader: &mut Reader<BufReader<File>>,
|
||||
buf: &mut Vec<u8>,
|
||||
) -> Vec<LocalDisplayName> {
|
||||
let mut result = Vec::new();
|
||||
|
||||
let mut parse = false;
|
||||
let mut language = None;
|
||||
|
||||
loop {
|
||||
match reader.read_event(buf) {
|
||||
Ok(Event::End(ref e)) if e.name() == b"localeDisplayNames" => {
|
||||
break;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"languages" => {
|
||||
parse = true;
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"languages" => {
|
||||
parse = false;
|
||||
}
|
||||
Ok(Event::Start(ref e)) if e.name() == b"language" && parse => {
|
||||
let a = e
|
||||
.attributes()
|
||||
.filter_map(Result::ok)
|
||||
.find(|a| a.key == b"type");
|
||||
|
||||
if let Some(a) = a {
|
||||
if let Ok(lang) = reader.decode_without_bom(&a.value) {
|
||||
language = Language::from_639_1(lang)
|
||||
.or_else(|| Language::from_639_3(lang))
|
||||
.or_else(|| Language::from_locale(lang));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::End(ref e)) if e.name() == b"language" && parse => {
|
||||
language = None;
|
||||
}
|
||||
Ok(Event::Text(e)) if language.is_some() => {
|
||||
if let Ok(text) = reader.decode_without_bom(&e) {
|
||||
result.push(LocalDisplayName {
|
||||
name: text.to_lowercase(),
|
||||
lang: language.take().unwrap(),
|
||||
});
|
||||
}
|
||||
}
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
_ => (),
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
result
|
||||
match opt.cmd {
|
||||
Command::Cldr(cmd) => cmd.run(),
|
||||
Command::Glottolog(cmd) => cmd.run(),
|
||||
Command::Merge(cmd) => cmd.run(),
|
||||
}
|
||||
}
|
||||
|
||||
108
txtlang-gen/src/merge.rs
Normal file
108
txtlang-gen/src/merge.rs
Normal file
@@ -0,0 +1,108 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufRead, BufReader, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Result;
|
||||
use argh::FromArgs;
|
||||
use isolang::Language;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Entry {
|
||||
language: Language,
|
||||
source_languages: Vec<Language>,
|
||||
sources: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(FromArgs, PartialEq, Debug)]
|
||||
/// Merge TSV files.
|
||||
#[argh(subcommand, name = "merge")]
|
||||
pub struct Merge {
|
||||
#[argh(positional)]
|
||||
files: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
impl Merge {
|
||||
pub fn run(&self) -> Result<()> {
|
||||
let mut entries = HashMap::new();
|
||||
|
||||
for file in &self.files {
|
||||
let reader = File::open(&file).map(BufReader::new)?;
|
||||
|
||||
for line in reader.lines() {
|
||||
let line = line?;
|
||||
|
||||
let parts = line.splitn(4, '\t').collect::<Vec<_>>();
|
||||
|
||||
let language = Language::from_639_3(parts[0]).unwrap();
|
||||
let source_language = Language::from_639_3(parts[1]).unwrap();
|
||||
let name = parts[2].to_string();
|
||||
let source = parts[3].to_string();
|
||||
|
||||
if entries
|
||||
.get(&name)
|
||||
.map(|entry: &Entry| entry.language != language)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
// TODO Print or log this.
|
||||
entries.remove(&name);
|
||||
} else {
|
||||
entries
|
||||
.entry(name.clone())
|
||||
.and_modify(|entry: &mut Entry| {
|
||||
entry.source_languages.push(source_language);
|
||||
entry.sources.push(source.clone());
|
||||
})
|
||||
.or_insert_with(|| Entry {
|
||||
language,
|
||||
source_languages: vec![source_language],
|
||||
sources: vec![source],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut entries = entries
|
||||
.into_iter()
|
||||
.map(|(name, mut entry)| {
|
||||
entry
|
||||
.source_languages
|
||||
.sort_by_key(|language| language.to_639_3());
|
||||
entry.source_languages.dedup();
|
||||
|
||||
entry.sources.sort();
|
||||
entry.sources.dedup();
|
||||
|
||||
(entry.language, name, entry.source_languages, entry.sources)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
entries.sort_unstable_by_key(|(language, name, ..)| (language.to_639_3(), name.clone()));
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut lock = stdout.lock();
|
||||
|
||||
for (language, name, source_language, sources) in entries {
|
||||
assert!(!name.is_empty());
|
||||
assert!(!name.contains('\n'));
|
||||
assert!(!name.contains('\t'));
|
||||
|
||||
writeln!(
|
||||
lock,
|
||||
"{}\t{}\t{}\t{}",
|
||||
language.to_639_3(),
|
||||
name,
|
||||
source_language
|
||||
.iter()
|
||||
.map(|language| language.to_639_3())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", "),
|
||||
sources.join(", ")
|
||||
)?;
|
||||
}
|
||||
|
||||
lock.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -7,7 +7,7 @@ use std::path::Path;
|
||||
use fst::MapBuilder;
|
||||
|
||||
fn main() -> Result<(), Box<dyn Error>> {
|
||||
let lines = File::open("languages.txt")
|
||||
let lines = File::open("languages.tsv")
|
||||
.map(BufReader::new)?
|
||||
.lines()
|
||||
.filter_map(Result::ok)
|
||||
@@ -18,10 +18,10 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
let mut buf = [0; 8];
|
||||
|
||||
for line in &lines {
|
||||
let parts = line.splitn(2, '\t').collect::<Vec<_>>();
|
||||
let parts = line.splitn(4, '\t').collect::<Vec<_>>();
|
||||
|
||||
let name = parts[0];
|
||||
let language = parts[1];
|
||||
let language = parts[0];
|
||||
let name = parts[1];
|
||||
|
||||
buf[5..].copy_from_slice(language.as_bytes());
|
||||
let value = u64::from_be_bytes(buf);
|
||||
|
||||
67088
txtlang/languages.tsv
Normal file
67088
txtlang/languages.tsv
Normal file
File diff suppressed because it is too large
Load Diff
45803
txtlang/languages.txt
45803
txtlang/languages.txt
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user