Add form-of entries generator

This commit is contained in:
Augusto Gunsch 2021-12-31 14:22:05 -03:00
parent a458302388
commit f21c09514e
No known key found for this signature in database
GPG Key ID: F7EEFE29825C72DC
6 changed files with 154 additions and 74 deletions

9
Cargo.lock generated
View File

@ -583,10 +583,11 @@ dependencies = [
"clap", "clap",
"config", "config",
"futures", "futures",
"json",
"reqwest", "reqwest",
"rocket", "rocket",
"rusqlite", "rusqlite",
"serde 1.0.132",
"serde_json",
"tokio", "tokio",
] ]
@ -632,12 +633,6 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "json"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
[[package]] [[package]]
name = "lazy_static" name = "lazy_static"
version = "1.4.0" version = "1.4.0"

View File

@ -12,6 +12,7 @@ rocket = { version = "0.5.0-rc.1", features = ["json"] }
reqwest = "0.11" reqwest = "0.11"
rusqlite = "0.25.3" rusqlite = "0.25.3"
futures = "0.3" futures = "0.3"
json = "0.12"
config = "0.11" config = "0.11"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

View File

@ -1,11 +1,15 @@
use std::fs; use std::fs;
use reqwest; use reqwest;
use rusqlite::Connection; use rusqlite::{Connection, Transaction};
use rusqlite::params; use rusqlite::params;
use serde_json::Value;
use serde_json::json;
use serde_json;
use crate::language::Language; use crate::language::Language;
use crate::entry::WiktionaryEntries; use crate::entry::{WiktionaryEntries, WiktionaryEntry};
use crate::entry::Form;
/// A database of Wiktionary entries /// A database of Wiktionary entries
pub struct WordDb { pub struct WordDb {
@ -24,61 +28,144 @@ impl WordDb {
} }
pub fn clean_tables(&mut self, lang: &Language) { pub fn clean_tables(&mut self, lang: &Language) {
let mut connection = self.connect(); let mut conn = self.connect();
let transaction = connection.transaction().unwrap(); let transaction = conn.transaction().unwrap();
transaction.execute(&format!("DROP TABLE IF EXISTS {}_words", &lang.code), []).unwrap(); transaction.execute(&format!("DROP TABLE IF EXISTS {0}_words", &lang.code), []).unwrap();
transaction.execute(&format!("DROP TABLE IF EXISTS {}_types", &lang.code), []).unwrap(); transaction.execute(&format!("DROP TABLE IF EXISTS {0}_types", &lang.code), []).unwrap();
transaction.execute(&format!(" transaction.execute(&format!("
CREATE TABLE {}_types ( CREATE TABLE {0}_types (
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
name TINYTEXT UNIQUE NOT NULL name TINYTEXT UNIQUE NOT NULL
)", &lang.code), []).unwrap(); )", &lang.code), []).unwrap();
for type_ in &lang.types { for type_ in &lang.types {
transaction.execute(&format!(" transaction.execute(&format!("
INSERT INTO {}_types ( name ) INSERT INTO {0}_types ( name )
VALUES ( VALUES (
? ?
)", &lang.code), [type_]).unwrap(); )", &lang.code), [type_]).unwrap();
} }
transaction.execute(&format!(" transaction.execute(&format!("
CREATE TABLE {}_words ( CREATE TABLE {0}_words (
id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
word TINYTEXT NOT NULL, word TINYTEXT NOT NULL,
type_id INTEGER NOT NULL, type_id INTEGER NOT NULL,
content MEDIUMTEXT NOT NULL, content MEDIUMTEXT NOT NULL,
FOREIGN KEY (type_id) FOREIGN KEY (type_id)
REFERENCES {}_types (id) REFERENCES {0}_types (id)
)", &lang.code, &lang.code), []).unwrap(); )", &lang.code), []).unwrap();
transaction.execute(&format!(" transaction.execute(&format!("
CREATE INDEX word_index CREATE INDEX word_index
ON {}_words (word) ON {0}_words (word)
", &lang.code), []).unwrap(); ", &lang.code), []).unwrap();
transaction.commit().unwrap(); transaction.commit().unwrap();
} }
pub fn insert_entries(&mut self, lang: &Language, entries: WiktionaryEntries) { pub fn insert_entry(&self, transaction: &Transaction, lang: &Language, entry: &WiktionaryEntry) {
let mut connection = self.connect();
let transaction = connection.transaction().unwrap();
for entry in entries {
transaction.execute(&format!(" transaction.execute(&format!("
INSERT INTO {}_words ( word, content, type_id ) INSERT INTO {0}_words ( word, content, type_id )
VALUES ( VALUES (
?, ?, ?, ?,
(SELECT id FROM {}_types WHERE name = ?) (SELECT id FROM {0}_types WHERE name = ?)
)", &lang.code, &lang.code), )", &lang.code),
params![entry.word, params![entry.word,
entry.parsed_json.to_string(), entry.parsed_json.to_string(),
entry.type_] entry.type_]
).unwrap(); ).unwrap();
} }
/// Inserts every entry of `entries` into the tables of `lang`.
///
/// The whole batch runs in one transaction that is committed after the
/// last insert.
///
/// # Panics
/// Panics if the transaction cannot be started or committed, or if any
/// single insert fails.
pub fn insert_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
    let mut connection = self.connect();
    let tx = connection.transaction().unwrap();

    entries
        .iter()
        .for_each(|entry| self.insert_entry(&tx, lang, entry));

    tx.commit().unwrap();
}
/// Generate missing "form-of" entries
pub fn generate_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
let mut conn = self.connect();
let transaction = conn.transaction().unwrap();
let mut statement = transaction.prepare(&format!(
"SELECT {0}_words.content
FROM {0}_words
JOIN {0}_types
ON {0}_types.id = {0}_words.type_id
WHERE {0}_words.word = ?
AND {0}_types.name = ?", &lang.code)
).unwrap();
for entry in entries.iter() {
if let Some(forms) = entry.parsed_json["forms"].as_array() {
let mut forms_vec: Vec<Form> = Vec::new();
for form in forms {
let form: Form = serde_json::from_value(form.clone()).unwrap();
forms_vec.push(form);
}
forms_vec = forms_vec.into_iter()
.filter(|x|
match &x.source {
Some(src) => src == "Declension" || src == "Conjugation",
None => false
}
).collect();
forms_vec.sort_by_key(|x| x.form.clone());
let forms_group = forms_vec.group_by(|a, b| a.form == b.form);
for forms in forms_group.into_iter() {
let mut entries = statement.query([&forms[0].form, &entry.type_]).unwrap();
if let None = entries.next().unwrap() {
let mut senses: Vec<Value> = Vec::new();
for form in forms {
let mut tags = form.tags.clone();
tags.push(String::from("form-of"));
tags.push(String::from("auto-generated"));
senses.push(json!({
"form_of": [
{
"word": entry.word
}
],
"glosses": [
form.tags.join(" ")
],
"tags": tags
}));
}
let entry_json = json!({
"pos": entry.type_.clone(),
"word": forms[0].form.clone(),
"senses": senses
});
let new_entry = WiktionaryEntry::new(forms[0].form.clone(),
entry.type_.clone(),
entry_json);
self.insert_entry(&transaction, lang, &new_entry);
}
}
}
}
drop(statement);
transaction.commit().unwrap(); transaction.commit().unwrap();
} }
@ -110,7 +197,10 @@ impl WordDb {
let entries = WiktionaryEntries::parse_data(data); let entries = WiktionaryEntries::parse_data(data);
println!("Inserting data..."); println!("Inserting data...");
self.insert_entries(lang, entries); self.insert_entries(lang, &entries);
println!("Generating \"form-of\" entries...");
self.generate_entries(lang, &entries);
println!("Done"); println!("Done");
} }

View File

@ -1,14 +1,13 @@
use std::cmp; use std::cmp;
use std::iter::IntoIterator; use std::slice::Iter;
use json::JsonValue::{Object, Short}; use serde_json::Value;
use json::JsonValue; use serde::Deserialize;
#[derive (Clone)] #[derive (Clone, Debug)]
#[derive (Debug)]
pub struct WiktionaryEntry { pub struct WiktionaryEntry {
pub word: String, pub word: String,
pub type_: String, pub type_: String,
pub parsed_json: JsonValue pub parsed_json: Value,
} }
impl cmp::PartialEq for WiktionaryEntry { impl cmp::PartialEq for WiktionaryEntry {
@ -33,29 +32,10 @@ impl cmp::Ord for WiktionaryEntry {
impl WiktionaryEntry { impl WiktionaryEntry {
pub fn parse(unparsed_json: &str) -> Self { pub fn parse(unparsed_json: &str) -> Self {
let json = json::parse(unparsed_json).unwrap(); let json: Value = serde_json::from_str(unparsed_json).unwrap();
let (word, type_) = match &json { let word = String::from(json["word"].as_str().unwrap());
Object(o) => ( let type_ = String::from(json["pos"].as_str().unwrap());
match o.get("word") {
Some(w) => match w {
Short(s) => s.to_string(),
JsonValue::String(s) => s.clone(),
_ => panic!("Not a string: {}", w.pretty(8))
},
None => panic!("No field 'word': {}", o.pretty(8))
},
match o.get("pos") {
Some(w) => match w {
Short(s) => s.to_string(),
JsonValue::String(s) => s.clone(),
_ => panic!("Not a string: {}", w.pretty(8))
},
None => panic!("No field 'pos': {}", o.pretty(8))
}
),
_ => panic!("Not an object: {}", json.pretty(8))
};
Self { Self {
word, word,
@ -63,6 +43,14 @@ impl WiktionaryEntry {
parsed_json: json parsed_json: json
} }
} }
/// Builds an entry directly from its parts.
///
/// Nothing is parsed or checked here: `word` and `type_` are stored as
/// given, independently of what `parsed_json` contains.
pub fn new(word: String, type_: String, parsed_json: Value) -> Self {
    Self { parsed_json, type_, word }
}
} }
pub struct WiktionaryEntries(Vec<WiktionaryEntry>); pub struct WiktionaryEntries(Vec<WiktionaryEntry>);
@ -77,13 +65,16 @@ impl WiktionaryEntries {
Self(entries) Self(entries)
} }
pub fn iter(&self) -> Iter<WiktionaryEntry> {
self.0.iter()
}
} }
impl IntoIterator for WiktionaryEntries { #[derive(Debug, Deserialize)]
type Item = WiktionaryEntry; pub struct Form {
type IntoIter = std::vec::IntoIter<Self::Item>; pub form: String,
pub tags: Vec<String>,
pub source: Option<String>,
}
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}

View File

@ -1,3 +1,5 @@
#![feature(slice_group_by)]
//mod database; //mod database;
use rocket::routes; use rocket::routes;
use rocket::fs::FileServer; use rocket::fs::FileServer;

View File

@ -19,9 +19,9 @@ pub fn frontend() -> Option<content::Html<String>> {
#[get("/langs/<lang>/words/<word>")] #[get("/langs/<lang>/words/<word>")]
pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom<content::Json<String>> { pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom<content::Json<String>> {
let connection = db.connect(); let conn = db.connect();
let mut statement = connection.prepare(&format!( let mut statement = conn.prepare(&format!(
"SELECT content "SELECT content
FROM {}_words FROM {}_words
WHERE word = ?", WHERE word = ?",
@ -38,6 +38,7 @@ pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom
words.push_str(&content); words.push_str(&content);
words.push(','); words.push(',');
} }
// Drop the trailing comma; if the popped character is '[' instead, it is pushed back below
if words.pop().unwrap() == '[' { if words.pop().unwrap() == '[' {
words.push('['); words.push('[');
} }
@ -48,9 +49,9 @@ pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom
#[get("/langs/<lang>/words?<like>&<limit>&<offset>")] #[get("/langs/<lang>/words?<like>&<limit>&<offset>")]
pub fn get_entries_like(db: &State<WordDb>, lang: &str, like: &str, limit: usize, offset: usize) -> Json<Vec<String>> { pub fn get_entries_like(db: &State<WordDb>, lang: &str, like: &str, limit: usize, offset: usize) -> Json<Vec<String>> {
let connection = db.connect(); let conn = db.connect();
let mut statement = connection.prepare(&format!( let mut statement = conn.prepare(&format!(
"SELECT word "SELECT word
FROM {}_words FROM {}_words
WHERE word LIKE ? WHERE word LIKE ?