more keywords + autoscope

This commit is contained in:
Dmitry Belyaev 2019-07-26 12:25:45 +03:00
parent cbea45413c
commit b4de387798
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3

View File

@ -6,27 +6,39 @@ extern crate zip;
use encoding::all::KOI8_R; use encoding::all::KOI8_R;
use encoding::DecoderTrap; use encoding::DecoderTrap;
use std::fs; use std::fs;
use std::io::BufReader; use std::io;
use std::io::Read;
use std::path::PathBuf; use std::path::PathBuf;
use textstream::TextReader; use textstream::TextReader;
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum KeywordType { enum KeywordType {
Ignore,
Global, Global,
QuestionPre, QuestionPre,
QuestionStart, QuestionStart,
QuestionContent, QuestionContent,
CurrentScope,
}
#[derive(Debug, Clone, Copy)]
enum DataScope {
Global,
QuestionPre,
QuestionContent,
} }
fn keyword_type(pattern: &str) -> KeywordType { fn keyword_type(pattern: &str) -> KeywordType {
use KeywordType::*;
match pattern { match pattern {
"Чемпионат:" | "URL:" | "Дата:" | "Редактор:" | "Вид:" => { "Мета:" => Ignore,
KeywordType::Global "Чемпионат:" | "Пакет:" => Global,
} "Тур:" => QuestionPre,
"Тур:" => KeywordType::QuestionPre, "Вопрос " => QuestionStart,
"Вопрос " => KeywordType::QuestionStart, "Вопрос:" => QuestionStart,
_ => KeywordType::QuestionContent, "Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
} }
} }
@ -40,6 +52,8 @@ struct Context {
cur_question: json::JsonValue, cur_question: json::JsonValue,
// temp value for pre-question fields // temp value for pre-question fields
cur_question_pre: json::JsonValue, cur_question_pre: json::JsonValue,
// scope for data fields
cur_scope: DataScope,
// current json key // current json key
cur_tag: String, cur_tag: String,
// current json value // current json value
@ -52,23 +66,34 @@ struct Context {
last_tag: String, last_tag: String,
} }
fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error>> { fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = BufReader::new(file); let buf = io::BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![ let patterns = vec![
"Чемпионат:", "Чемпионат:",
"Пакет:",
"URL:", "URL:",
"Ссылка:",
"Дата:", "Дата:",
"Редактор:", "Редактор:",
"Обработан:",
"Копирайт:",
"Инфо:",
"Тема:",
"Вид:", "Вид:",
"Тип:",
"Тур:", "Тур:",
"Мета:",
"Вопрос ", "Вопрос ",
"Вопрос:",
"Ответ:", "Ответ:",
"Зачет:", "Зачет:",
"Источник:", "Источник:",
"Рейтинг:",
"Автор:", "Автор:",
"Комментарий:", "Комментарий:",
"Комментарии:",
]; ];
// init context // init context
let mut context = Context { let mut context = Context {
@ -79,6 +104,7 @@ fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error
cur_question_pre: json::JsonValue::new_object(), cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(), cur_tag: String::new(),
cur_content: Vec::<String>::new(), cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false, have_new_question: false,
last_keyword_type: None, last_keyword_type: None,
last_tag: String::new(), last_tag: String::new(),
@ -98,6 +124,8 @@ fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error
.find(|&&pattern| line_s.starts_with(pattern) && line_s.ends_with(":")) .find(|&&pattern| line_s.starts_with(pattern) && line_s.ends_with(":"))
{ {
Some(pattern) => { Some(pattern) => {
use KeywordType::*;
ctx.last_keyword_type = ctx.cur_keyword_type; ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone(); ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(keyword_type(&pattern)); ctx.cur_keyword_type = Some(keyword_type(&pattern));
@ -105,35 +133,46 @@ fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error
// remember question id // remember question id
match ctx.cur_keyword_type { match ctx.cur_keyword_type {
Some(KeywordType::QuestionStart) => { Some(QuestionStart) => {
ctx.cur_question_pre["id"] = line_s.replace(":", "").as_str().into() ctx.cur_question_pre["id"] = line_s.replace(":", "").as_str().into();
} }
_ => (), _ => (),
}; };
// apply accumulated content when new keyword found // apply accumulated content when new keyword found
match ctx.last_keyword_type { match ctx.last_keyword_type {
Some(KeywordType::Global) => { Some(Global) => {
ctx.cur_scope = DataScope::Global;
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into() ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
} }
Some(KeywordType::QuestionPre) => { Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into(); ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
} }
Some(KeywordType::QuestionContent) => { Some(QuestionStart) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); ctx.cur_scope = DataScope::QuestionContent;
}
Some(KeywordType::QuestionStart) => {
// store prev question before reading new // store prev question before reading new
if ctx.have_new_question { if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap(); ctx.questions.push(ctx.cur_question.clone()).unwrap();
} }
// prepare to read new question data with cur_question_pre values // prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone(); ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question_pre = json::JsonValue::new_object(); // ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true; ctx.have_new_question = true;
} }
None => (), Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
(match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
})[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
}; };
// clear content // clear content
ctx.cur_content.clear(); ctx.cur_content.clear();
@ -159,18 +198,21 @@ fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error
fn main() -> Result<(), Box<std::error::Error>> { fn main() -> Result<(), Box<std::error::Error>> {
let fname = std::path::Path::new("./baza.zip"); let fname = std::path::Path::new("./baza.zip");
let zip_file = fs::File::open(&fname)?; let zip_file = fs::File::open(&fname)?;
let zip_reader = BufReader::new(zip_file); let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader)?; let mut archive = zip::ZipArchive::new(zip_reader)?;
for i in 0..archive.len() { for i in 0..archive.len() {
let file = archive.by_index(i)?; let file = archive.by_index(i)?;
// FIXME
//if ! file.is_file() {
// continue;
//}
let name = file.sanitized_name(); let name = file.sanitized_name();
// skip files without "txt" extension
match name.extension() {
Some(ext) => match ext.to_str() {
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => (),
_ => continue, // extension is not valid unicode or not txt
},
_ => continue, // no extension in filename
}
println!("{}", name.as_path().display()); println!("{}", name.as_path().display());
let data: json::JsonValue = parse_file(file)?; let data: json::JsonValue = parse_file(file)?;
let mut outfilename = PathBuf::from("./json"); let mut outfilename = PathBuf::from("./json");
@ -182,7 +224,7 @@ fn main() -> Result<(), Box<std::error::Error>> {
data.write_pretty(&mut outfile, 1)?; data.write_pretty(&mut outfile, 1)?;
//data.write(&mut outfile)?; //data.write(&mut outfile)?;
//debug //debug
//break; break;
} }
Ok(()) Ok(())
} }