Add form-of entries generator
parent a458302388
commit f21c09514e
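What the new generator produces, as a minimal sketch with hypothetical Latin data (the word "aqua" and its form "aquam" are illustrative, not taken from this commit): for each inflected form listed in a base entry's declension or conjugation table that has no entry of its own, the generator inserts a stub entry whose sense points back to the base word. The field names mirror the json!({...}) blocks added to src/database.rs below.

use serde_json::json;

fn main() {
    // Hypothetical entry generated for the form "aquam" of the noun "aqua".
    // The shape ("pos", "word", "senses", "form_of", "glosses", "tags")
    // follows the json! calls in WordDb::generate_entries.
    let generated = json!({
        "word": "aquam",
        "pos": "noun",
        "senses": [{
            "form_of": [{ "word": "aqua" }],
            "glosses": ["accusative singular"],
            "tags": ["accusative", "singular", "form-of", "auto-generated"]
        }]
    });
    println!("{}", serde_json::to_string_pretty(&generated).unwrap());
}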
Cargo.lock
@@ -583,10 +583,11 @@ dependencies = [
  "clap",
  "config",
  "futures",
- "json",
  "reqwest",
  "rocket",
  "rusqlite",
+ "serde 1.0.132",
+ "serde_json",
  "tokio",
 ]

@@ -632,12 +633,6 @@ dependencies = [
  "wasm-bindgen",
 ]

-[[package]]
-name = "json"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "078e285eafdfb6c4b434e0d31e8cfcb5115b651496faca5749b88fafd4f23bfd"
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
Cargo.toml
@@ -12,6 +12,7 @@ rocket = { version = "0.5.0-rc.1", features = ["json"] }
 reqwest = "0.11"
 rusqlite = "0.25.3"
 futures = "0.3"
-json = "0.12"
 config = "0.11"
 tokio = { version = "1", features = ["full"] }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
132 src/database.rs
@@ -1,11 +1,15 @@
 use std::fs;

 use reqwest;
-use rusqlite::Connection;
+use rusqlite::{Connection, Transaction};
 use rusqlite::params;
+use serde_json::Value;
+use serde_json::json;
+use serde_json;

 use crate::language::Language;
-use crate::entry::WiktionaryEntries;
+use crate::entry::{WiktionaryEntries, WiktionaryEntry};
+use crate::entry::Form;

 /// A database of Wiktionary entries
 pub struct WordDb {
@@ -24,61 +28,144 @@ impl WordDb {
     }

     pub fn clean_tables(&mut self, lang: &Language) {
-        let mut connection = self.connect();
-        let transaction = connection.transaction().unwrap();
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();

-        transaction.execute(&format!("DROP TABLE IF EXISTS {}_words", &lang.code), []).unwrap();
-        transaction.execute(&format!("DROP TABLE IF EXISTS {}_types", &lang.code), []).unwrap();
+        transaction.execute(&format!("DROP TABLE IF EXISTS {0}_words", &lang.code), []).unwrap();
+        transaction.execute(&format!("DROP TABLE IF EXISTS {0}_types", &lang.code), []).unwrap();

         transaction.execute(&format!("
-            CREATE TABLE {}_types (
+            CREATE TABLE {0}_types (
                 id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                 name TINYTEXT UNIQUE NOT NULL
             )", &lang.code), []).unwrap();

         for type_ in &lang.types {
             transaction.execute(&format!("
-                INSERT INTO {}_types ( name )
+                INSERT INTO {0}_types ( name )
                 VALUES (
                     ?
                 )", &lang.code), [type_]).unwrap();
         }

         transaction.execute(&format!("
-            CREATE TABLE {}_words (
+            CREATE TABLE {0}_words (
                 id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
                 word TINYTEXT NOT NULL,
                 type_id INTEGER NOT NULL,
                 content MEDIUMTEXT NOT NULL,
                 FOREIGN KEY (type_id)
-                    REFERENCES {}_types (id)
-            )", &lang.code, &lang.code), []).unwrap();
+                    REFERENCES {0}_types (id)
+            )", &lang.code), []).unwrap();

         transaction.execute(&format!("
             CREATE INDEX word_index
-            ON {}_words (word)
+            ON {0}_words (word)
             ", &lang.code), []).unwrap();

         transaction.commit().unwrap();
     }

-    pub fn insert_entries(&mut self, lang: &Language, entries: WiktionaryEntries) {
-        let mut connection = self.connect();
-        let transaction = connection.transaction().unwrap();
-
-        for entry in entries {
+    pub fn insert_entry(&self, transaction: &Transaction, lang: &Language, entry: &WiktionaryEntry) {
         transaction.execute(&format!("
-            INSERT INTO {}_words ( word, content, type_id )
+            INSERT INTO {0}_words ( word, content, type_id )
             VALUES (
                 ?, ?,
-                (SELECT id FROM {}_types WHERE name = ?)
-            )", &lang.code, &lang.code),
+                (SELECT id FROM {0}_types WHERE name = ?)
+            )", &lang.code),
             params![entry.word,
                     entry.parsed_json.to_string(),
                     entry.type_]
         ).unwrap();
     }

+    pub fn insert_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();
+
+        for entry in entries.iter() {
+            self.insert_entry(&transaction, lang, entry);
+        }
+
+        transaction.commit().unwrap();
+    }
+
+    /// Generate missing "form-of" entries
+    pub fn generate_entries(&mut self, lang: &Language, entries: &WiktionaryEntries) {
+        let mut conn = self.connect();
+        let transaction = conn.transaction().unwrap();
+
+        let mut statement = transaction.prepare(&format!(
+            "SELECT {0}_words.content
+             FROM {0}_words
+             JOIN {0}_types
+             ON {0}_types.id = {0}_words.type_id
+             WHERE {0}_words.word = ?
+             AND {0}_types.name = ?", &lang.code)
+        ).unwrap();
+
+        for entry in entries.iter() {
+            if let Some(forms) = entry.parsed_json["forms"].as_array() {
+                let mut forms_vec: Vec<Form> = Vec::new();
+
+                for form in forms {
+                    let form: Form = serde_json::from_value(form.clone()).unwrap();
+                    forms_vec.push(form);
+                }
+
+                forms_vec = forms_vec.into_iter()
+                    .filter(|x|
+                        match &x.source {
+                            Some(src) => src == "Declension" || src == "Conjugation",
+                            None => false
+                        }
+                    ).collect();
+
+                forms_vec.sort_by_key(|x| x.form.clone());
+
+                let forms_group = forms_vec.group_by(|a, b| a.form == b.form);
+
+                for forms in forms_group.into_iter() {
+                    let mut entries = statement.query([&forms[0].form, &entry.type_]).unwrap();
+
+                    if let None = entries.next().unwrap() {
+                        let mut senses: Vec<Value> = Vec::new();
+
+                        for form in forms {
+                            let mut tags = form.tags.clone();
+                            tags.push(String::from("form-of"));
+                            tags.push(String::from("auto-generated"));
+
+                            senses.push(json!({
+                                "form_of": [
+                                    {
+                                        "word": entry.word
+                                    }
+                                ],
+                                "glosses": [
+                                    form.tags.join(" ")
+                                ],
+                                "tags": tags
+                            }));
+                        }
+
+                        let entry_json = json!({
+                            "pos": entry.type_.clone(),
+                            "word": forms[0].form.clone(),
+                            "senses": senses
+                        });
+
+                        let new_entry = WiktionaryEntry::new(forms[0].form.clone(),
+                                                             entry.type_.clone(),
+                                                             entry_json);
+
+                        self.insert_entry(&transaction, lang, &new_entry);
+                    }
+                }
+            }
+        }
+
+        drop(statement);
         transaction.commit().unwrap();
     }

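The grouping step above leans on the nightly slice_group_by feature (enabled in the crate root further down): forms are sorted by surface form, then group_by collapses consecutive duplicates so one entry is generated per distinct form, with one sense per tag set. A minimal sketch of that behaviour, assuming a nightly toolchain and made-up form strings:

#![feature(slice_group_by)] // nightly-only at the time of this commit

fn main() {
    // Sorted input, as produced by forms_vec.sort_by_key(...) above.
    let forms = ["aquae", "aquae", "aquam"];
    // group_by yields maximal runs for which the predicate holds between
    // neighbours, i.e. runs of identical surface forms here.
    let groups: Vec<&[&str]> = forms.group_by(|a, b| a == b).collect();
    assert_eq!(groups, [&["aquae", "aquae"][..], &["aquam"][..]]);
}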
@@ -110,7 +197,10 @@ impl WordDb {
         let entries = WiktionaryEntries::parse_data(data);

         println!("Inserting data...");
-        self.insert_entries(lang, entries);
+        self.insert_entries(lang, &entries);

+        println!("Generating \"form-of\" entries...");
+        self.generate_entries(lang, &entries);
+
         println!("Done");
     }

61 src/entry.rs
@@ -1,14 +1,13 @@
 use std::cmp;
-use std::iter::IntoIterator;
-use json::JsonValue::{Object, Short};
-use json::JsonValue;
+use std::slice::Iter;
+use serde_json::Value;
+use serde::Deserialize;

-#[derive (Clone)]
-#[derive (Debug)]
+#[derive (Clone, Debug)]
 pub struct WiktionaryEntry {
     pub word: String,
     pub type_: String,
-    pub parsed_json: JsonValue
+    pub parsed_json: Value,
 }

 impl cmp::PartialEq for WiktionaryEntry {
@@ -33,29 +32,10 @@ impl cmp::Ord for WiktionaryEntry {

 impl WiktionaryEntry {
     pub fn parse(unparsed_json: &str) -> Self {
-        let json = json::parse(unparsed_json).unwrap();
+        let json: Value = serde_json::from_str(unparsed_json).unwrap();

-        let (word, type_) = match &json {
-            Object(o) => (
-                match o.get("word") {
-                    Some(w) => match w {
-                        Short(s) => s.to_string(),
-                        JsonValue::String(s) => s.clone(),
-                        _ => panic!("Not a string: {}", w.pretty(8))
-                    },
-                    None => panic!("No field 'word': {}", o.pretty(8))
-                },
-                match o.get("pos") {
-                    Some(w) => match w {
-                        Short(s) => s.to_string(),
-                        JsonValue::String(s) => s.clone(),
-                        _ => panic!("Not a string: {}", w.pretty(8))
-                    },
-                    None => panic!("No field 'pos': {}", o.pretty(8))
-                }
-            ),
-            _ => panic!("Not an object: {}", json.pretty(8))
-        };
+        let word = String::from(json["word"].as_str().unwrap());
+        let type_ = String::from(json["pos"].as_str().unwrap());

         Self {
             word,
@@ -63,6 +43,14 @@ impl WiktionaryEntry {
             parsed_json: json
         }
     }
+
+    pub fn new(word: String, type_: String, parsed_json: Value) -> Self {
+        Self {
+            word,
+            type_,
+            parsed_json
+        }
+    }
 }

 pub struct WiktionaryEntries(Vec<WiktionaryEntry>);
@@ -77,13 +65,16 @@ impl WiktionaryEntries {

         Self(entries)
     }
-}

-impl IntoIterator for WiktionaryEntries {
-    type Item = WiktionaryEntry;
-    type IntoIter = std::vec::IntoIter<Self::Item>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.0.into_iter()
+    pub fn iter(&self) -> Iter<WiktionaryEntry> {
+        self.0.iter()
     }
 }

+#[derive(Debug, Deserialize)]
+pub struct Form {
+    pub form: String,
+    pub tags: Vec<String>,
+    pub source: Option<String>,
+}
+
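The new Form struct above is the target type for each element of an entry's "forms" array; generate_entries deserializes those elements with serde_json::from_value and keeps only the ones whose source is "Declension" or "Conjugation". A small self-contained sketch with made-up data (the field values are hypothetical):

use serde::Deserialize;
use serde_json::json;

#[derive(Debug, Deserialize)]
struct Form {
    form: String,
    tags: Vec<String>,
    source: Option<String>,
}

fn main() {
    // Hypothetical "forms" element as it might appear in a Wiktionary dump.
    let value = json!({
        "form": "aquam",
        "tags": ["accusative", "singular"],
        "source": "Declension"
    });
    let form: Form = serde_json::from_value(value).unwrap();
    // Only Declension/Conjugation-sourced forms pass the filter in
    // WordDb::generate_entries.
    assert!(matches!(form.source.as_deref(), Some("Declension")));
    assert_eq!(form.tags.join(" "), "accusative singular");
}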
@@ -1,3 +1,5 @@
+#![feature(slice_group_by)]
+
 //mod database;
 use rocket::routes;
 use rocket::fs::FileServer;
@@ -19,9 +19,9 @@ pub fn frontend() -> Option<content::Html<String>> {

 #[get("/langs/<lang>/words/<word>")]
 pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom<content::Json<String>> {
-    let connection = db.connect();
+    let conn = db.connect();

-    let mut statement = connection.prepare(&format!(
+    let mut statement = conn.prepare(&format!(
         "SELECT content
         FROM {}_words
         WHERE word = ?",
@@ -38,6 +38,7 @@ pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom
         words.push_str(&content);
         words.push(',');
     }
+    // Remove last comma
     if words.pop().unwrap() == '[' {
         words.push('[');
     }
@@ -48,9 +49,9 @@ pub fn get_entries(db: &State<WordDb>, lang: &str, word: &str) -> status::Custom

 #[get("/langs/<lang>/words?<like>&<limit>&<offset>")]
 pub fn get_entries_like(db: &State<WordDb>, lang: &str, like: &str, limit: usize, offset: usize) -> Json<Vec<String>> {
-    let connection = db.connect();
+    let conn = db.connect();

-    let mut statement = connection.prepare(&format!(
+    let mut statement = conn.prepare(&format!(
         "SELECT word
         FROM {}_words
         WHERE word LIKE ?