refactor
parent befca99019
commit 44d2567419

src/main.rs +444 −444
@@ -7,269 +7,269 @@ extern crate zip;
 use encoding::all::KOI8_R;
 use encoding::DecoderTrap;
 use rayon::prelude::*;
-use std::fs;
-use std::io;
 use std::path::PathBuf;
+use std::{fs, io};
 use textstream::TextReader;
 
-const BASE_FILENAME: &str = "./baza.zip";
+const BASE_FILENAME: &str = "baza.zip";
+const OUTPUT_PATH: &str = "json";
 
 #[derive(Debug, Clone, Copy)]
 enum KeywordType {
     Ignore,
     Global,
     QuestionPre,
     QuestionStart,
     QuestionContent,
     CurrentScope,
 }
 
 #[derive(Debug, Clone, Copy)]
 enum DataScope {
     Global,
     QuestionPre,
     QuestionContent,
 }
 
-fn keyword_type(pattern: &str) -> KeywordType {
-    use KeywordType::*;
-    match pattern {
-        "Мета:" => Ignore,
-        "Чемпионат:" | "Пакет:" => Global,
-        "Тур:" => QuestionPre,
-        "Вопрос " => QuestionStart,
-        "Вопрос:" => QuestionStart,
-        "Ответ:" | "Зачет:" => QuestionContent,
-        _ => CurrentScope,
-        // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
-        // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
-    }
-}
-
 struct Context {
     // global output value
     data: json::JsonValue,
     // temp questions array
     questions: json::JsonValue,
     cur_keyword_type: Option<KeywordType>,
     // temp question value
     cur_question: json::JsonValue,
     // temp value for pre-question fields
     cur_question_pre: json::JsonValue,
     // scope for data fields
     cur_scope: DataScope,
     // current json key
     cur_tag: String,
     // current json value
     cur_content: Vec<String>,
     // need to push temp question value if true
     have_new_question: bool,
     // prev. keyword type
     last_keyword_type: Option<KeywordType>,
     // prev. json key (used to store accumulated content when a new keyword is read)
     last_tag: String,
 }
 
 impl Context {
     fn new() -> Context {
         Context {
             data: json::JsonValue::new_object(),
             questions: json::JsonValue::new_array(),
             cur_keyword_type: None,
             cur_question: json::JsonValue::new_object(),
             cur_question_pre: json::JsonValue::new_object(),
             cur_tag: String::new(),
             cur_content: Vec::<String>::new(),
             cur_scope: DataScope::Global,
             have_new_question: false,
             last_keyword_type: None,
             last_tag: String::new(),
         }
     }
 }
 
+impl KeywordType {
+    fn from(pattern: &str) -> KeywordType {
+        use KeywordType::*;
+        match pattern {
+            "Мета:" => Ignore,
+            "Чемпионат:" | "Пакет:" => Global,
+            "Тур:" => QuestionPre,
+            "Вопрос " => QuestionStart,
+            "Вопрос:" => QuestionStart,
+            "Ответ:" | "Зачет:" => QuestionContent,
+            _ => CurrentScope,
+            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
+            // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
+        }
+    }
+}
+
 fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
     let buf = io::BufReader::new(file);
     let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
 
     let patterns = vec![
         "Чемпионат:",
         "Пакет:",
         "URL:",
         "Ссылка:",
         "Дата:",
         "Редактор:",
         "Обработан:",
         "Копирайт:",
         "Инфо:",
         "Тема:",
         "Вид:",
         "Тип:",
         "Тур:",
         "Мета:",
         "Вопрос ",
         "Вопрос:",
         "Ответ:",
         "Зачет:",
         "Источник:",
         "Рейтинг:",
         "Автор:",
         "Комментарий:",
         "Комментарии:",
     ];
     let mut context = Context::new();
     let mut ctx = &mut context;
 
     reader
         .lines()
         .map(|line| String::from(line.unwrap().trim()))
         .filter(|line| !line.is_empty()) // ignore empty lines
         .for_each(|line| {
             match patterns
                 .iter() // find keyword
                 .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
             {
                 Some(pattern) => {
                     use KeywordType::*;
                     ctx.last_keyword_type = ctx.cur_keyword_type;
                     ctx.last_tag = ctx.cur_tag.clone();
-                    ctx.cur_keyword_type = Some(keyword_type(&pattern));
+                    ctx.cur_keyword_type = Some(KeywordType::from(&pattern));
                     ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
                     // remember question id
                     if let Some(QuestionStart) = ctx.cur_keyword_type {
                         ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                     };
                     // apply accumulated content when new keyword found
                     match ctx.last_keyword_type {
                         Some(Global) => {
                             ctx.cur_scope = DataScope::Global;
                             ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
                         }
                         Some(QuestionPre) => {
                             ctx.cur_scope = DataScope::QuestionPre;
                             ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                         }
                         Some(QuestionStart) => {
                             ctx.cur_scope = DataScope::QuestionContent;
                             // store prev question before reading new
                             if ctx.have_new_question {
                                 ctx.questions.push(ctx.cur_question.clone()).unwrap();
                             }
-                            // prepare for read new question data with cur_question_pre values
+                            // prepare to read new question data with cur_question_pre values
                             ctx.cur_question = ctx.cur_question_pre.clone();
-                            // ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
                             ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                             ctx.have_new_question = true;
                         }
                         Some(QuestionContent) => {
                             ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                         }
                         Some(CurrentScope) => {
                             // match value to store data
-                            (match ctx.cur_scope {
+                            let scope_data = match ctx.cur_scope {
                                 DataScope::Global => &mut ctx.data,
                                 DataScope::QuestionPre => &mut ctx.cur_question_pre,
                                 DataScope::QuestionContent => &mut ctx.cur_question,
-                            })[&ctx.last_tag] = ctx.cur_content.join("\n").into();
+                            };
+                            scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                         }
                         _ => (), // None or Ignore
                     };
                     // clear content
                     ctx.cur_content.clear();
                 }
                 None => {
                     // accumulate content if line is not a keyword
                     ctx.cur_content.push(line);
                 }
             }
         });
 
     // finish reading last question
     if ctx.have_new_question && !ctx.cur_content.is_empty() {
-        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into();
+        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
         ctx.questions.push(ctx.cur_question.clone()).unwrap();
         ctx.have_new_question = false;
     }
 
     ctx.data["Вопросы"] = ctx.questions.clone();
     Ok(ctx.data.clone())
 }
 
 // split slice to a vector of slices
 fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> {
-    let all_len = src.len();
-    let part_len = all_len / num;
-    let add_len = all_len % num;
-    let mut result = Vec::<&'a [T]>::new();
+    let part_len = src.len() / num;
+    let add_len = src.len() % num;
+    let mut result = Vec::<&'a [T]>::with_capacity(num);
 
     if 0 == part_len {
         result.push(src);
         return result;
     }
     for i in 0..num {
         let size = if (num - 1) == i {
             part_len + add_len
         } else {
             part_len
         };
         let start = part_len * i;
         result.push(&src[start..(start + size)]);
     }
     result
 }
 
 fn process_files(files: &&[PathBuf]) {
     let zip_file = fs::File::open(BASE_FILENAME).unwrap();
     let zip_reader = io::BufReader::new(zip_file);
     let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
 
     files.iter().for_each(|name| {
         let name_str = name.to_str().unwrap();
         println!("{:}", name_str);
 
         // parse txt file
         let file = archive.by_name(name_str).unwrap();
         let data = parse_file(file).unwrap();
 
         // make output filename
-        let mut outfilename = PathBuf::from("./json");
+        let mut outfilename = PathBuf::from(OUTPUT_PATH);
         outfilename.push(name);
         outfilename.set_extension("json");
 
         // save json to file
         let mut outfile = fs::File::create(outfilename).unwrap();
         data.write_pretty(&mut outfile, 1).unwrap();
     });
 }
 
 fn main() -> Result<(), Box<std::error::Error>> {
     // open archive just to list files
     let zip_file = fs::File::open(BASE_FILENAME)?;
     let zip_reader = io::BufReader::new(zip_file);
     let mut archive = zip::ZipArchive::new(zip_reader)?;
 
     let source_files: Vec<PathBuf> = (0..archive.len())
         .map(|i| archive.by_index(i).unwrap().sanitized_name())
         .filter(|name| {
             // skip files without "txt" extension
             match name.extension() {
                 Some(ext) => match ext.to_str() {
-                    Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true,
+                    Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
                     _ => false, // extension is not valid unicode or not txt
                 },
                 _ => false, // no extension in filename
             }
         })
         .collect();
     drop(archive);
 
     // split vector and process its parts in parallel
     split_vec(&source_files, rayon::current_num_threads())
         .par_iter()
         .for_each(process_files);
     Ok(())
 }
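
The keyword classifier moves from a free function into an inherent impl, so call sites switch from keyword_type(&pattern) to KeywordType::from(&pattern) while the lookup itself stays the same. A trimmed, self-contained sketch of that pattern (shortened keyword set, derives added so the asserts compile; not code from this commit):

#[derive(Debug, Clone, Copy, PartialEq)]
enum KeywordType {
    Ignore,
    Global,
    QuestionStart,
    CurrentScope,
}

impl KeywordType {
    // Inherent constructor, mirroring the refactored call style:
    // KeywordType::from(pattern) instead of keyword_type(pattern).
    fn from(pattern: &str) -> KeywordType {
        use KeywordType::*;
        match pattern {
            "Мета:" => Ignore,
            "Чемпионат:" | "Пакет:" => Global,
            "Вопрос " | "Вопрос:" => QuestionStart,
            _ => CurrentScope,
        }
    }
}

fn main() {
    assert_eq!(KeywordType::from("Пакет:"), KeywordType::Global);
    assert_eq!(KeywordType::from("Автор:"), KeywordType::CurrentScope);
    println!("ok");
}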
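
split_vec now preallocates with Vec::with_capacity(num) but keeps its contract: num chunks, with the division remainder absorbed by the last chunk, and the whole slice handed over as a single chunk when there are fewer items than chunks. A standalone sketch of that behavior (the example values are illustrative, not from the repo):

fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> {
    let part_len = src.len() / num;
    let add_len = src.len() % num;
    let mut result = Vec::<&'a [T]>::with_capacity(num);

    // Fewer items than chunks: give the whole slice to one worker.
    if 0 == part_len {
        result.push(src);
        return result;
    }
    for i in 0..num {
        // The last chunk takes part_len plus the remainder.
        let size = if (num - 1) == i { part_len + add_len } else { part_len };
        let start = part_len * i;
        result.push(&src[start..(start + size)]);
    }
    result
}

fn main() {
    let data: Vec<u32> = (0..10).collect();
    let parts = split_vec(&data, 3); // 10 = 3 + 3 + (3 + 1)
    let lens: Vec<usize> = parts.iter().map(|p| p.len()).collect();
    assert_eq!(lens, vec![3, 3, 4]);
    println!("{:?}", parts);
}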
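
Finally, the output side of parse_file leans on a few json-crate idioms that are easy to miss in the diff: indexing a JsonValue with a &str creates the key on assignment, push appends to an array and returns a Result, and write_pretty serializes to any io::Write with the given indent width. A minimal sketch (assumes a json crate version with this API, such as 0.11; the sample keys and values are made up):

fn main() {
    let mut data = json::JsonValue::new_object();
    let mut questions = json::JsonValue::new_array();

    // Index-assignment creates the "id" and "Ответ" keys, as in parse_file.
    let mut q = json::JsonValue::new_object();
    q["id"] = "Вопрос 1".into();
    q["Ответ"] = "пример".into();
    questions.push(q).unwrap();

    data["Чемпионат"] = "Тестовый".into();
    data["Вопросы"] = questions;

    // write_pretty takes a writer plus an indent width, as used in process_files.
    let mut out = Vec::new();
    data.write_pretty(&mut out, 1).unwrap();
    println!("{}", String::from_utf8(out).unwrap());
}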