This commit is contained in:
Dmitry Belyaev 2019-07-27 20:24:49 +03:00
parent befca99019
commit 44d2567419
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3

View File

@ -7,269 +7,269 @@ extern crate zip;
use encoding::all::KOI8_R; use encoding::all::KOI8_R;
use encoding::DecoderTrap; use encoding::DecoderTrap;
use rayon::prelude::*; use rayon::prelude::*;
use std::fs;
use std::io;
use std::path::PathBuf; use std::path::PathBuf;
use std::{fs, io};
use textstream::TextReader; use textstream::TextReader;
const BASE_FILENAME: &str = "./baza.zip"; const BASE_FILENAME: &str = "baza.zip";
const OUTPUT_PATH: &str = "json";
/// Kind of parser action a recognized keyword line triggers.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    /// Keyword whose content is discarded ("Мета:").
    Ignore,
    /// Field stored on the whole document ("Чемпионат:", "Пакет:").
    Global,
    /// Field that applies to all following questions ("Тур:").
    QuestionPre,
    /// Starts a new question ("Вопрос ", "Вопрос:").
    QuestionStart,
    /// Field stored on the current question ("Ответ:", "Зачет:").
    QuestionContent,
    /// Any other keyword: stored into whatever scope is currently active.
    CurrentScope,
}
/// Which JSON object currently receives scope-relative fields.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    /// Write into the document-level object.
    Global,
    /// Write into the pre-question template (fields inherited by questions).
    QuestionPre,
    /// Write into the question being read right now.
    QuestionContent,
}
fn keyword_type(pattern: &str) -> KeywordType {
use KeywordType::*;
match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
}
} }
struct Context { struct Context {
// global output value // global output value
data: json::JsonValue, data: json::JsonValue,
// temp questions array // temp questions array
questions: json::JsonValue, questions: json::JsonValue,
cur_keyword_type: Option<KeywordType>, cur_keyword_type: Option<KeywordType>,
// temp question value // temp question value
cur_question: json::JsonValue, cur_question: json::JsonValue,
// temp value for pre'question fields // temp value for pre'question fields
cur_question_pre: json::JsonValue, cur_question_pre: json::JsonValue,
// scope for data fields // scope for data fields
cur_scope: DataScope, cur_scope: DataScope,
// curent json key // curent json key
cur_tag: String, cur_tag: String,
// current json value // current json value
cur_content: Vec<String>, cur_content: Vec<String>,
// need to push temp question value if true // need to push temp question value if true
have_new_question: bool, have_new_question: bool,
// prev. keyword type // prev. keyword type
last_keyword_type: Option<KeywordType>, last_keyword_type: Option<KeywordType>,
// prev. json key (used for store acummulated content when new keyword readed) // prev. json key (used for store acummulated content when new keyword readed)
last_tag: String, last_tag: String,
} }
impl Context { impl Context {
fn new() -> Context { fn new() -> Context {
Context { Context {
data: json::JsonValue::new_object(), data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(), questions: json::JsonValue::new_array(),
cur_keyword_type: None, cur_keyword_type: None,
cur_question: json::JsonValue::new_object(), cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(), cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(), cur_tag: String::new(),
cur_content: Vec::<String>::new(), cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global, cur_scope: DataScope::Global,
have_new_question: false, have_new_question: false,
last_keyword_type: None, last_keyword_type: None,
last_tag: String::new(), last_tag: String::new(),
} }
} }
}
impl KeywordType {
fn from(pattern: &str) -> KeywordType {
use KeywordType::*;
match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
}
}
} }
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> { fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = io::BufReader::new(file); let buf = io::BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![ let patterns = vec![
"Чемпионат:", "Чемпионат:",
"Пакет:", "Пакет:",
"URL:", "URL:",
"Ссылка:", "Ссылка:",
"Дата:", "Дата:",
"Редактор:", "Редактор:",
"Обработан:", "Обработан:",
"Копирайт:", "Копирайт:",
"Инфо:", "Инфо:",
"Тема:", "Тема:",
"Вид:", "Вид:",
"Тип:", "Тип:",
"Тур:", "Тур:",
"Мета:", "Мета:",
"Вопрос ", "Вопрос ",
"Вопрос:", "Вопрос:",
"Ответ:", "Ответ:",
"Зачет:", "Зачет:",
"Источник:", "Источник:",
"Рейтинг:", "Рейтинг:",
"Автор:", "Автор:",
"Комментарий:", "Комментарий:",
"Комментарии:", "Комментарии:",
]; ];
let mut context = Context::new();
let mut ctx = &mut context;
let mut context = Context::new(); reader
let mut ctx = &mut context; .lines()
.map(|line| String::from(line.unwrap().trim()))
.filter(|line| !line.is_empty()) // ignore empty lines
.for_each(|line| {
match patterns
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
use KeywordType::*;
reader ctx.last_keyword_type = ctx.cur_keyword_type;
.lines() ctx.last_tag = ctx.cur_tag.clone();
.map(|line| String::from(line.unwrap().trim())) ctx.cur_keyword_type = Some(KeywordType::from(&pattern));
.filter(|line| !line.is_empty()) // ignore empty lines ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
.for_each(|line| {
match patterns
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
use KeywordType::*;
ctx.last_keyword_type = ctx.cur_keyword_type; // remember question id
ctx.last_tag = ctx.cur_tag.clone(); if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_keyword_type = Some(keyword_type(&pattern)); ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
ctx.cur_tag = pattern.replace(' ', "").replace(':', ""); };
// remember question id // apply accumulated content when new keyword found
if let Some(QuestionStart) = ctx.cur_keyword_type { match ctx.last_keyword_type {
ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into(); Some(Global) => {
}; ctx.cur_scope = DataScope::Global;
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap();
}
// prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
let scope_data = match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
};
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(line);
}
}
});
// apply accumulated content when new keyword found // finish reading last question
match ctx.last_keyword_type { if ctx.have_new_question && !ctx.cur_content.is_empty() {
Some(Global) => { ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.cur_scope = DataScope::Global; ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into() ctx.have_new_question = false;
} }
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap();
}
// prepare for read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
// ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
(match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
})[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(line);
}
}
});
// finish reading last question ctx.data["Вопросы"] = ctx.questions.clone();
if ctx.have_new_question && !ctx.cur_content.is_empty() { Ok(ctx.data.clone())
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into();
ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
} }
/// Split a slice into `num` contiguous parts; the last part absorbs the
/// remainder. When `num` is 0, or larger than the slice (so each part would
/// be empty), the whole slice is returned as a single part.
///
/// Fix: the original divided by `num` unconditionally, so `num == 0`
/// panicked with a division by zero; it is now guarded.
fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> {
    // guard: avoid division by zero below
    if num == 0 {
        return vec![src];
    }
    let part_len = src.len() / num;
    let add_len = src.len() % num;
    if 0 == part_len {
        return vec![src];
    }
    let mut result = Vec::<&'a [T]>::with_capacity(num);
    for i in 0..num {
        // last part takes the remainder
        let size = if (num - 1) == i {
            part_len + add_len
        } else {
            part_len
        };
        let start = part_len * i;
        result.push(&src[start..(start + size)]);
    }
    result
}
fn process_files(files: &&[PathBuf]) { fn process_files(files: &&[PathBuf]) {
let zip_file = fs::File::open(BASE_FILENAME).unwrap(); let zip_file = fs::File::open(BASE_FILENAME).unwrap();
let zip_reader = io::BufReader::new(zip_file); let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
files.iter().for_each(|name| { files.iter().for_each(|name| {
let name_str = name.to_str().unwrap(); let name_str = name.to_str().unwrap();
println!("{:}", name_str); println!("{:}", name_str);
// parse txt file // parse txt file
let file = archive.by_name(name_str).unwrap(); let file = archive.by_name(name_str).unwrap();
let data = parse_file(file).unwrap(); let data = parse_file(file).unwrap();
// make output filename // make output filename
let mut outfilename = PathBuf::from("./json"); let mut outfilename = PathBuf::from(OUTPUT_PATH);
outfilename.push(name); outfilename.push(name);
outfilename.set_extension("json"); outfilename.set_extension("json");
// save json to file // save json to file
let mut outfile = fs::File::create(outfilename).unwrap(); let mut outfile = fs::File::create(outfilename).unwrap();
data.write_pretty(&mut outfile, 1).unwrap(); data.write_pretty(&mut outfile, 1).unwrap();
}); });
} }
fn main() -> Result<(), Box<std::error::Error>> { fn main() -> Result<(), Box<std::error::Error>> {
// open archive just to list files // open archive just to list files
let zip_file = fs::File::open(BASE_FILENAME)?; let zip_file = fs::File::open(BASE_FILENAME)?;
let zip_reader = io::BufReader::new(zip_file); let zip_reader = io::BufReader::new(zip_file);
let mut archive = zip::ZipArchive::new(zip_reader)?; let mut archive = zip::ZipArchive::new(zip_reader)?;
let source_files: Vec<PathBuf> = (0..archive.len()) let source_files: Vec<PathBuf> = (0..archive.len())
.map(|i| archive.by_index(i).unwrap().sanitized_name()) .map(|i| archive.by_index(i).unwrap().sanitized_name())
.filter(|name| { .filter(|name| {
// skip files without "txt" extension // skip files without "txt" extension
match name.extension() { match name.extension() {
Some(ext) => match ext.to_str() { Some(ext) => match ext.to_str() {
Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true, Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
_ => false, // extension is not valid unicode or not txt _ => false, // extension is not valid unicode or not txt
}, },
_ => false, // no extension in filename _ => false, // no extension in filename
} }
}) })
.collect(); .collect();
drop(archive); drop(archive);
// split vector and process its parts in parallel // split vector and process its parts in parallel
split_vec(&source_files, rayon::current_num_threads()) split_vec(&source_files, rayon::current_num_threads())
.par_iter() .par_iter()
.for_each(process_files); .for_each(process_files);
Ok(()) Ok(())
} }