diff --git a/src/main.rs b/src/main.rs index 418f6f0..8409df7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,269 +7,269 @@ extern crate zip; use encoding::all::KOI8_R; use encoding::DecoderTrap; use rayon::prelude::*; -use std::fs; -use std::io; use std::path::PathBuf; +use std::{fs, io}; use textstream::TextReader; -const BASE_FILENAME: &str = "./baza.zip"; +const BASE_FILENAME: &str = "baza.zip"; +const OUTPUT_PATH: &str = "json"; #[derive(Debug, Clone, Copy)] enum KeywordType { - Ignore, - Global, - QuestionPre, - QuestionStart, - QuestionContent, - CurrentScope, + Ignore, + Global, + QuestionPre, + QuestionStart, + QuestionContent, + CurrentScope, } #[derive(Debug, Clone, Copy)] enum DataScope { - Global, - QuestionPre, - QuestionContent, -} - -fn keyword_type(pattern: &str) -> KeywordType { - use KeywordType::*; - match pattern { - "Мета:" => Ignore, - "Чемпионат:" | "Пакет:" => Global, - "Тур:" => QuestionPre, - "Вопрос " => QuestionStart, - "Вопрос:" => QuestionStart, - "Ответ:" | "Зачет:" => QuestionContent, - _ => CurrentScope, - // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" | - // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:" - } + Global, + QuestionPre, + QuestionContent, } struct Context { - // global output value - data: json::JsonValue, - // temp questions array - questions: json::JsonValue, - cur_keyword_type: Option, - // temp question value - cur_question: json::JsonValue, - // temp value for pre'question fields - cur_question_pre: json::JsonValue, - // scope for data fields - cur_scope: DataScope, - // curent json key - cur_tag: String, - // current json value - cur_content: Vec, - // need to push temp question value if true - have_new_question: bool, - // prev. keyword type - last_keyword_type: Option, - // prev. json key (used for store acummulated content when new keyword readed) - last_tag: String, + // global output value + data: json::JsonValue, + // temp questions array + questions: json::JsonValue, + cur_keyword_type: Option, + // temp question value + cur_question: json::JsonValue, + // temp value for pre'question fields + cur_question_pre: json::JsonValue, + // scope for data fields + cur_scope: DataScope, + // curent json key + cur_tag: String, + // current json value + cur_content: Vec, + // need to push temp question value if true + have_new_question: bool, + // prev. keyword type + last_keyword_type: Option, + // prev. json key (used for store acummulated content when new keyword readed) + last_tag: String, } impl Context { - fn new() -> Context { - Context { - data: json::JsonValue::new_object(), - questions: json::JsonValue::new_array(), - cur_keyword_type: None, - cur_question: json::JsonValue::new_object(), - cur_question_pre: json::JsonValue::new_object(), - cur_tag: String::new(), - cur_content: Vec::::new(), - cur_scope: DataScope::Global, - have_new_question: false, - last_keyword_type: None, - last_tag: String::new(), - } - } + fn new() -> Context { + Context { + data: json::JsonValue::new_object(), + questions: json::JsonValue::new_array(), + cur_keyword_type: None, + cur_question: json::JsonValue::new_object(), + cur_question_pre: json::JsonValue::new_object(), + cur_tag: String::new(), + cur_content: Vec::::new(), + cur_scope: DataScope::Global, + have_new_question: false, + last_keyword_type: None, + last_tag: String::new(), + } + } +} + +impl KeywordType { + fn from(pattern: &str) -> KeywordType { + use KeywordType::*; + match pattern { + "Мета:" => Ignore, + "Чемпионат:" | "Пакет:" => Global, + "Тур:" => QuestionPre, + "Вопрос " => QuestionStart, + "Вопрос:" => QuestionStart, + "Ответ:" | "Зачет:" => QuestionContent, + _ => CurrentScope, + // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" | + // "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:" + } + } } fn parse_file(file: impl io::Read) -> Result> { - let buf = io::BufReader::new(file); - let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); + let buf = io::BufReader::new(file); + let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore); - let patterns = vec![ - "Чемпионат:", - "Пакет:", - "URL:", - "Ссылка:", - "Дата:", - "Редактор:", - "Обработан:", - "Копирайт:", - "Инфо:", - "Тема:", - "Вид:", - "Тип:", - "Тур:", - "Мета:", - "Вопрос ", - "Вопрос:", - "Ответ:", - "Зачет:", - "Источник:", - "Рейтинг:", - "Автор:", - "Комментарий:", - "Комментарии:", - ]; - - let mut context = Context::new(); - let mut ctx = &mut context; + let patterns = vec![ + "Чемпионат:", + "Пакет:", + "URL:", + "Ссылка:", + "Дата:", + "Редактор:", + "Обработан:", + "Копирайт:", + "Инфо:", + "Тема:", + "Вид:", + "Тип:", + "Тур:", + "Мета:", + "Вопрос ", + "Вопрос:", + "Ответ:", + "Зачет:", + "Источник:", + "Рейтинг:", + "Автор:", + "Комментарий:", + "Комментарии:", + ]; + let mut context = Context::new(); + let mut ctx = &mut context; - reader - .lines() - .map(|line| String::from(line.unwrap().trim())) - .filter(|line| !line.is_empty()) // ignore empty lines - .for_each(|line| { - match patterns - .iter() // find keyword - .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) - { - Some(pattern) => { - use KeywordType::*; + reader + .lines() + .map(|line| String::from(line.unwrap().trim())) + .filter(|line| !line.is_empty()) // ignore empty lines + .for_each(|line| { + match patterns + .iter() // find keyword + .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':')) + { + Some(pattern) => { + use KeywordType::*; - ctx.last_keyword_type = ctx.cur_keyword_type; - ctx.last_tag = ctx.cur_tag.clone(); - ctx.cur_keyword_type = Some(keyword_type(&pattern)); - ctx.cur_tag = pattern.replace(' ', "").replace(':', ""); + ctx.last_keyword_type = ctx.cur_keyword_type; + ctx.last_tag = ctx.cur_tag.clone(); + ctx.cur_keyword_type = Some(KeywordType::from(&pattern)); + ctx.cur_tag = pattern.replace(' ', "").replace(':', ""); - // remember question id - if let Some(QuestionStart) = ctx.cur_keyword_type { - ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into(); - }; + // remember question id + if let Some(QuestionStart) = ctx.cur_keyword_type { + ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into(); + }; - // apply accumulated content when new keyword found - match ctx.last_keyword_type { - Some(Global) => { - ctx.cur_scope = DataScope::Global; - ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into() - } - Some(QuestionPre) => { - ctx.cur_scope = DataScope::QuestionPre; - ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into(); - } - Some(QuestionStart) => { - ctx.cur_scope = DataScope::QuestionContent; - // store prev question before reading new - if ctx.have_new_question { - ctx.questions.push(ctx.cur_question.clone()).unwrap(); - } - // prepare for read new question data with cur_question_pre values - ctx.cur_question = ctx.cur_question_pre.clone(); - // ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question - ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); - ctx.have_new_question = true; - } - Some(QuestionContent) => { - ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); - } - Some(CurrentScope) => { - // match value to store data - (match ctx.cur_scope { - DataScope::Global => &mut ctx.data, - DataScope::QuestionPre => &mut ctx.cur_question_pre, - DataScope::QuestionContent => &mut ctx.cur_question, - })[&ctx.last_tag] = ctx.cur_content.join("\n").into(); - } - _ => (), //None or Ignore - }; - // clear content - ctx.cur_content.clear(); - } - None => { - // accumulate content if line is not a keyword - ctx.cur_content.push(line); - } - } - }); + // apply accumulated content when new keyword found + match ctx.last_keyword_type { + Some(Global) => { + ctx.cur_scope = DataScope::Global; + ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into() + } + Some(QuestionPre) => { + ctx.cur_scope = DataScope::QuestionPre; + ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into(); + } + Some(QuestionStart) => { + ctx.cur_scope = DataScope::QuestionContent; + // store prev question before reading new + if ctx.have_new_question { + ctx.questions.push(ctx.cur_question.clone()).unwrap(); + } + // prepare to read new question data with cur_question_pre values + ctx.cur_question = ctx.cur_question_pre.clone(); + ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); + ctx.have_new_question = true; + } + Some(QuestionContent) => { + ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into(); + } + Some(CurrentScope) => { + // match value to store data + let scope_data = match ctx.cur_scope { + DataScope::Global => &mut ctx.data, + DataScope::QuestionPre => &mut ctx.cur_question_pre, + DataScope::QuestionContent => &mut ctx.cur_question, + }; + scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into(); + } + _ => (), //None or Ignore + }; + // clear content + ctx.cur_content.clear(); + } + None => { + // accumulate content if line is not a keyword + ctx.cur_content.push(line); + } + } + }); - // finish reading last question - if ctx.have_new_question && !ctx.cur_content.is_empty() { - ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into(); - ctx.questions.push(ctx.cur_question.clone()).unwrap(); - ctx.have_new_question = false; - } + // finish reading last question + if ctx.have_new_question && !ctx.cur_content.is_empty() { + ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into(); + ctx.questions.push(ctx.cur_question.clone()).unwrap(); + ctx.have_new_question = false; + } - ctx.data["Вопросы"] = ctx.questions.clone(); - Ok(ctx.data.clone()) + ctx.data["Вопросы"] = ctx.questions.clone(); + Ok(ctx.data.clone()) } // split slice to a vector of slices fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> { - let all_len = src.len(); - let part_len = all_len / num; - let add_len = all_len % num; - let mut result = Vec::<&'a [T]>::new(); + let part_len = src.len() / num; + let add_len = src.len() % num; + let mut result = Vec::<&'a [T]>::with_capacity(num); - if 0 == part_len { - result.push(src); - return result; - } - for i in 0..num { - let size = if (num - 1) == i { - part_len + add_len - } else { - part_len - }; - let start = part_len * i; - result.push(&src[start..(start + size)]); - } - result + if 0 == part_len { + result.push(src); + return result; + } + for i in 0..num { + let size = if (num - 1) == i { + part_len + add_len + } else { + part_len + }; + let start = part_len * i; + result.push(&src[start..(start + size)]); + } + result } fn process_files(files: &&[PathBuf]) { - let zip_file = fs::File::open(BASE_FILENAME).unwrap(); - let zip_reader = io::BufReader::new(zip_file); - let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); + let zip_file = fs::File::open(BASE_FILENAME).unwrap(); + let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader).unwrap(); - files.iter().for_each(|name| { - let name_str = name.to_str().unwrap(); - println!("{:}", name_str); + files.iter().for_each(|name| { + let name_str = name.to_str().unwrap(); + println!("{:}", name_str); - // parse txt file - let file = archive.by_name(name_str).unwrap(); - let data = parse_file(file).unwrap(); + // parse txt file + let file = archive.by_name(name_str).unwrap(); + let data = parse_file(file).unwrap(); - // make output filename - let mut outfilename = PathBuf::from("./json"); - outfilename.push(name); - outfilename.set_extension("json"); + // make output filename + let mut outfilename = PathBuf::from(OUTPUT_PATH); + outfilename.push(name); + outfilename.set_extension("json"); - // save json to file - let mut outfile = fs::File::create(outfilename).unwrap(); - data.write_pretty(&mut outfile, 1).unwrap(); - }); + // save json to file + let mut outfile = fs::File::create(outfilename).unwrap(); + data.write_pretty(&mut outfile, 1).unwrap(); + }); } fn main() -> Result<(), Box> { - // open archive just to list files - let zip_file = fs::File::open(BASE_FILENAME)?; - let zip_reader = io::BufReader::new(zip_file); - let mut archive = zip::ZipArchive::new(zip_reader)?; + // open archive just to list files + let zip_file = fs::File::open(BASE_FILENAME)?; + let zip_reader = io::BufReader::new(zip_file); + let mut archive = zip::ZipArchive::new(zip_reader)?; - let source_files: Vec = (0..archive.len()) - .map(|i| archive.by_index(i).unwrap().sanitized_name()) - .filter(|name| { - // skip files without "txt" extension - match name.extension() { - Some(ext) => match ext.to_str() { - Some(ext_str) if ext_str.eq_ignore_ascii_case("txt") => true, - _ => false, // extension is not valid unicode or not txt - }, - _ => false, // no extension in filename - } - }) - .collect(); - drop(archive); + let source_files: Vec = (0..archive.len()) + .map(|i| archive.by_index(i).unwrap().sanitized_name()) + .filter(|name| { + // skip files without "txt" extension + match name.extension() { + Some(ext) => match ext.to_str() { + Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"), + _ => false, // extension is not valid unicode or not txt + }, + _ => false, // no extension in filename + } + }) + .collect(); + drop(archive); - // split vector and process its parts in parallel - split_vec(&source_files, rayon::current_num_threads()) - .par_iter() - .for_each(process_files); - Ok(()) + // split vector and process its parts in parallel + split_vec(&source_files, rayon::current_num_threads()) + .par_iter() + .for_each(process_files); + Ok(()) }