From f21c09514eaa1b4c9ce484de554c71153dbc9e5f Mon Sep 17 00:00:00 2001
From: Augusto Gunsch
Date: Fri, 31 Dec 2021 14:22:05 -0300
Subject: [PATCH] Add form-of entries generator

---
 Cargo.lock      |   9 +--
 Cargo.toml      |   3 +-
 src/database.rs | 144 +++++++++++++++++++++++++++++++++++++++---------
 src/entry.rs    |  61 +++++++++----------
 src/main.rs     |   2 +
 src/routes.rs   |   9 +--
 6 files changed, 154 insertions(+), 74 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 91b0306..d6086e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -583,10 +583,11 @@ dependencies = [
  "clap",
  "config",
  "futures",
- "json",
  "reqwest",
  "rocket",
  "rusqlite",
+ "serde 1.0.132",
+ "serde_json",
  "tokio",
 ]
 
@@ -632,12 +633,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "json"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
diff --git a/Cargo.toml b/Cargo.toml
index cee35db..ae49bb3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,6 +12,7 @@ rocket = { version = "0.5.0-rc.1", features = ["json"] }
 reqwest = "0.11"
 rusqlite = "0.25.3"
 futures = "0.3"
-json = "0.12"
 config = "0.11"
 tokio = { version = "1", features = ["full"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
diff --git a/src/database.rs b/src/database.rs
index 7394dce..4c705e9 100644
--- a/src/database.rs
+++ b/src/database.rs
@@ -1,11 +1,15 @@
 use std::fs;
 
 use reqwest;
-use rusqlite::Connection;
+use rusqlite::{Connection, Transaction};
 use rusqlite::params;
+use serde_json::Value;
+use serde_json::json;
+use serde_json;
 
 use crate::language::Language;
-use crate::entry::WiktionaryEntries;
+use crate::entry::{WiktionaryEntries, WiktionaryEntry};
+use crate::entry::Form;
 
 /// A database of Wiktionary entries
 pub struct WordDb {
@@ -24,64 +28,147 @@ impl WordDb {
     }
 
     pub fn clean_tables(&mut self, lang: &Language) {
-        let mut connection = self.connect();
-        let transaction = connection.transaction().unwrap();
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();
 
-        transaction.execute(&format!("DROP TABLE IF EXISTS {}_words", &lang.code), []).unwrap();
-        transaction.execute(&format!("DROP TABLE IF EXISTS {}_types", &lang.code), []).unwrap();
+        transaction.execute(&format!("DROP TABLE IF EXISTS {0}_words", &lang.code), []).unwrap();
+        transaction.execute(&format!("DROP TABLE IF EXISTS {0}_types", &lang.code), []).unwrap();
 
         transaction.execute(&format!("
-            CREATE TABLE {}_types (
+            CREATE TABLE {0}_types (
                 id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                 name TINYTEXT UNIQUE NOT NULL
            )", &lang.code), []).unwrap();
 
        for type_ in &lang.types {
            transaction.execute(&format!("
-                INSERT INTO {}_types ( name )
+                INSERT INTO {0}_types ( name )
                VALUES ( ?
            )", &lang.code), [type_]).unwrap();
        }
 
        transaction.execute(&format!("
-            CREATE TABLE {}_words (
+            CREATE TABLE {0}_words (
                id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                word TINYTEXT NOT NULL,
                type_id INTEGER NOT NULL,
                content MEDIUMTEXT NOT NULL,
                FOREIGN KEY (type_id)
-                    REFERENCES {}_types (id)
-            )", &lang.code, &lang.code), []).unwrap();
+                    REFERENCES {0}_types (id)
+            )", &lang.code), []).unwrap();
 
        transaction.execute(&format!("
            CREATE INDEX word_index
-            ON {}_words (word)
+            ON {0}_words (word)
        ", &lang.code), []).unwrap();
 
        transaction.commit().unwrap();
    }
 
-    pub fn insert_entries(&mut self, lang: &Language, entries: WiktionaryEntries) {
-        let mut connection = self.connect();
-        let transaction = connection.transaction().unwrap();
+    pub fn insert_entry(&self, transaction: &Transaction, lang: &Language, entry: &WiktionaryEntry) {
+        transaction.execute(&format!("
+            INSERT INTO {0}_words ( word, content, type_id )
+            VALUES (
+                ?, ?,
+                (SELECT id FROM {0}_types WHERE name = ?)
+            )", &lang.code),
+            params![entry.word,
+                    entry.parsed_json.to_string(),
+                    entry.type_]
+        ).unwrap();
+    }
 
-        for entry in entries {
-            transaction.execute(&format!("
-                INSERT INTO {}_words ( word, content, type_id )
-                VALUES (
-                    ?, ?,
-                    (SELECT id FROM {}_types WHERE name = ?)
-                )", &lang.code, &lang.code),
-                params![entry.word,
-                        entry.parsed_json.to_string(),
-                        entry.type_]
-            ).unwrap();
+    pub fn insert_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();
+
+        for entry in entries.iter() {
+            self.insert_entry(&transaction, lang, entry);
        }
 
        transaction.commit().unwrap();
    }
 
+    /// Generate missing "form-of" entries
+    pub fn generate_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();
+
+        let mut statement = transaction.prepare(&format!(
+            "SELECT {0}_words.content
+            FROM {0}_words
+            JOIN {0}_types
+            ON {0}_types.id = {0}_words.type_id
+            WHERE {0}_words.word = ?
+            AND {0}_types.name = ?", &lang.code)
+        ).unwrap();
+
+        for entry in entries.iter() {
+            if let Some(forms) = entry.parsed_json["forms"].as_array() {
+                let mut forms_vec: Vec<Form> = Vec::new();
+
+                for form in forms {
+                    let form: Form = serde_json::from_value(form.clone()).unwrap();
+                    forms_vec.push(form);
+                }
+
+                forms_vec = forms_vec.into_iter()
+                    .filter(|x|
+                        match &x.source {
+                            Some(src) => src == "Declension" || src == "Conjugation",
+                            None => false
+                        }
+                    ).collect();
+
+                forms_vec.sort_by_key(|x| x.form.clone());
+
+                let forms_group = forms_vec.group_by(|a, b| a.form == b.form);
+
+                for forms in forms_group.into_iter() {
+                    let mut entries = statement.query([&forms[0].form, &entry.type_]).unwrap();
+
+                    if let None = entries.next().unwrap() {
+                        let mut senses: Vec<Value> = Vec::new();
+
+                        for form in forms {
+                            let mut tags = form.tags.clone();
+                            tags.push(String::from("form-of"));
+                            tags.push(String::from("auto-generated"));
+
+                            senses.push(json!({
+                                "form_of": [
+                                    {
+                                        "word": entry.word
+                                    }
+                                ],
+                                "glosses": [
+                                    form.tags.join(" ")
+                                ],
+                                "tags": tags
+                            }));
+                        }
+
+                        let entry_json = json!({
+                            "pos": entry.type_.clone(),
+                            "word": forms[0].form.clone(),
+                            "senses": senses
+                        });
+
+                        let new_entry = WiktionaryEntry::new(forms[0].form.clone(),
+                                                             entry.type_.clone(),
+                                                             entry_json);
+
+                        self.insert_entry(&transaction, lang, &new_entry);
+                    }
+                }
+            }
+        }
+
+        drop(statement);
+        transaction.commit().unwrap();
+    }
+
     pub async fn upgrade_lang(&mut self, lang: &Language) {
         println!("Trying to read cached data...");
         let cached_data = fs::read_to_string("Polish.json");
@@ -110,7 +197,10 @@ impl WordDb {
         let entries = WiktionaryEntries::parse_data(data);
 
         println!("Inserting data...");
-        self.insert_entries(lang, entries);
+        self.insert_entries(lang, &entries);
+
+        println!("Generating \"form-of\" entries...");
+        self.generate_entries(lang, &entries);
 
         println!("Done");
     }
diff --git a/src/entry.rs b/src/entry.rs
index 4458817..3f99360 100644
--- a/src/entry.rs
+++ b/src/entry.rs
@@ -1,14 +1,13 @@
 use std::cmp;
-use std::iter::IntoIterator;
-use json::JsonValue::{Object, Short};
-use json::JsonValue;
+use std::slice::Iter;
+use serde_json::Value;
+use serde::Deserialize;
 
-#[derive (Clone)]
-#[derive (Debug)]
+#[derive (Clone, Debug)]
 pub struct WiktionaryEntry {
     pub word: String,
     pub type_: String,
-    pub parsed_json: JsonValue
+    pub parsed_json: Value,
 }
 
 impl cmp::PartialEq for WiktionaryEntry {
@@ -33,29 +32,10 @@ impl WiktionaryEntry {
 
 impl WiktionaryEntry {
     pub fn parse(unparsed_json: &str) -> Self {
-        let json = json::parse(unparsed_json).unwrap();
+        let json: Value = serde_json::from_str(unparsed_json).unwrap();
 
-        let (word, type_) = match &json {
-            Object(o) => (
-                match o.get("word") {
-                    Some(w) => match w {
-                        Short(s) => s.to_string(),
-                        JsonValue::String(s) => s.clone(),
-                        _ => panic!("Not a string: {}", w.pretty(8))
-                    },
-                    None => panic!("No field 'word': {}", o.pretty(8))
-                },
-                match o.get("pos") {
-                    Some(w) => match w {
-                        Short(s) => s.to_string(),
-                        JsonValue::String(s) => s.clone(),
-                        _ => panic!("Not a string: {}", w.pretty(8))
-                    },
-                    None => panic!("No field 'pos': {}", o.pretty(8))
-                }
-            ),
-            _ => panic!("Not an object: {}", json.pretty(8))
-        };
+        let word = String::from(json["word"].as_str().unwrap());
+        let type_ = String::from(json["pos"].as_str().unwrap());
 
         Self {
             word,
@@ -63,6 +43,14 @@ impl WiktionaryEntry {
             parsed_json: json
         }
     }
+
+    pub fn new(word: String, type_: String, parsed_json: Value) -> Self {
+        Self {
+            word,
+            type_,
+            parsed_json
+        }
+    }
 }
 
 pub struct WiktionaryEntries(Vec<WiktionaryEntry>);
@@ -77,13 +65,16 @@ impl WiktionaryEntries {
 
         Self(entries)
     }
-}
 
-impl IntoIterator for WiktionaryEntries {
-    type Item = WiktionaryEntry;
-    type IntoIter = std::vec::IntoIter<Self::Item>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.0.into_iter()
+    pub fn iter(&self) -> Iter<WiktionaryEntry> {
+        self.0.iter()
     }
 }
+
+#[derive(Debug, Deserialize)]
+pub struct Form {
+    pub form: String,
+    pub tags: Vec<String>,
+    pub source: Option<String>,
+}
+
diff --git a/src/main.rs b/src/main.rs
index 14117db..a20c977 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,5 @@
+#![feature(slice_group_by)]
+
 //mod database;
 use rocket::routes;
 use rocket::fs::FileServer;
diff --git a/src/routes.rs b/src/routes.rs
index 8dad838..ddd5177 100644
--- a/src/routes.rs
+++ b/src/routes.rs
@@ -19,9 +19,9 @@ pub fn frontend() -> Option> {
 
 #[get("/langs/<lang>/words/<word>")]
 pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom> {
-    let connection = db.connect();
+    let conn = db.connect();
 
-    let mut statement = connection.prepare(&format!(
+    let mut statement = conn.prepare(&format!(
         "SELECT content
         FROM {}_words
         WHERE word = ?",
@@ -38,6 +38,7 @@ pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom
         words.push_str(&content);
         words.push(',');
     }
+    // Remove last comma
     if words.pop().unwrap() == '[' {
         words.push('[');
     }
@@ -48,9 +49,9 @@
 
 #[get("/langs/<lang>/words?<like>&<limit>&<offset>")]
 pub fn get_entries_like(db: &State<WordDb>, lang: &str, like: &str, limit: usize, offset: usize) -> Json> {
-    let connection = db.connect();
+    let conn = db.connect();
 
-    let mut statement = connection.prepare(&format!(
+    let mut statement = conn.prepare(&format!(
         "SELECT word
         FROM {}_words
         WHERE word LIKE ?