This commit is contained in:
Dmitry Belyaev 2019-07-27 20:24:49 +03:00
parent befca99019
commit 44d2567419
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3

View File

@ -7,12 +7,12 @@ extern crate zip;
use encoding::all::KOI8_R; use encoding::all::KOI8_R;
use encoding::DecoderTrap; use encoding::DecoderTrap;
use rayon::prelude::*; use rayon::prelude::*;
use std::fs;
use std::io;
use std::path::PathBuf; use std::path::PathBuf;
use std::{fs, io};
use textstream::TextReader; use textstream::TextReader;
const BASE_FILENAME: &str = "./baza.zip"; const BASE_FILENAME: &str = "baza.zip";
const OUTPUT_PATH: &str = "json";
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
enum KeywordType { enum KeywordType {
@ -31,21 +31,6 @@ enum DataScope {
QuestionContent, QuestionContent,
} }
fn keyword_type(pattern: &str) -> KeywordType {
use KeywordType::*;
match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
}
}
struct Context { struct Context {
// global output value // global output value
data: json::JsonValue, data: json::JsonValue,
@ -88,6 +73,23 @@ impl Context {
} }
} }
impl KeywordType {
fn from(pattern: &str) -> KeywordType {
use KeywordType::*;
match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
}
}
}
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> { fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = io::BufReader::new(file); let buf = io::BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
@ -117,7 +119,6 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
"Комментарий:", "Комментарий:",
"Комментарии:", "Комментарии:",
]; ];
let mut context = Context::new(); let mut context = Context::new();
let mut ctx = &mut context; let mut ctx = &mut context;
@ -135,7 +136,7 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
ctx.last_keyword_type = ctx.cur_keyword_type; ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone(); ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(keyword_type(&pattern)); ctx.cur_keyword_type = Some(KeywordType::from(&pattern));
ctx.cur_tag = pattern.replace(' ', "").replace(':', ""); ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
// remember question id // remember question id
@ -159,9 +160,8 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
if ctx.have_new_question { if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap(); ctx.questions.push(ctx.cur_question.clone()).unwrap();
} }
// prepare for read new question data with cur_question_pre values // prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone(); ctx.cur_question = ctx.cur_question_pre.clone();
// ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true; ctx.have_new_question = true;
} }
@ -170,11 +170,12 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
} }
Some(CurrentScope) => { Some(CurrentScope) => {
// match value to store data // match value to store data
(match ctx.cur_scope { let scope_data = match ctx.cur_scope {
DataScope::Global => &mut ctx.data, DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre, DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question, DataScope::QuestionContent => &mut ctx.cur_question,
})[&ctx.last_tag] = ctx.cur_content.join("\n").into(); };
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
} }
_ => (), //None or Ignore _ => (), //None or Ignore
}; };
@ -190,7 +191,7 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
// finish reading last question // finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() { if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into(); ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.questions.push(ctx.cur_question.clone()).unwrap(); ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.have_new_question = false; ctx.have_new_question = false;
} }
@ -201,10 +202,9 @@ fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Er
// split slice to a vector of slices // split slice to a vector of slices
fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> { fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> {
let all_len = src.len(); let part_len = src.len() / num;
let part_len = all_len / num; let add_len = src.len() % num;
let add_len = all_len % num; let mut result = Vec::<&'a [T]>::with_capacity(num);
let mut result = Vec::<&'a [T]>::new();
if 0 == part_len { if 0 == part_len {
result.push(src); result.push(src);
@ -236,7 +236,7 @@ fn process_files(files: &&[PathBuf]) {
let data = parse_file(file).unwrap(); let data = parse_file(file).unwrap();
// make output filename // make output filename
let mut outfilename = PathBuf::from("./json"); let mut outfilename = PathBuf::from(OUTPUT_PATH);
outfilename.push(name); outfilename.push(name);
outfilename.set_extension("json"); outfilename.set_extension("json");
@ -258,7 +258,7 @@ fn main() -> Result<(), Box<std::error::Error>> {
// skip files without "txt" extension // skip files without "txt" extension
match name.extension() { match name.extension() {
Some(ext) => match ext.to_str() { Some(ext) => match ext.to_str() {
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true, Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
_ => false, // extension is not valid unicode or not txt _ => false, // extension is not valid unicode or not txt
}, },
_ => false, // no extension in filename _ => false, // no extension in filename