chgk_txt2json/src/main.rs
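//! Converts the chgk questions database from the `baza.zip` archive
//! (KOI8-R encoded txt files) into pretty-printed JSON, one output file per
//! source file, processing the archive in parallel with rayon.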
extern crate encoding;
extern crate json;
extern crate rayon;
extern crate textstream;
extern crate zip;
use encoding::all::KOI8_R;
use encoding::DecoderTrap;
use rayon::prelude::*;
use std::path::PathBuf;
use std::{fs, io};
use textstream::TextReader;
const BASE_FILENAME: &str = "baza.zip";
const OUTPUT_PATH: &str = "json";
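/// Classifies keywords by how they affect the parser: some switch the
/// current scope, some start a new question, and "Мета:" is ignored.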
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    Ignore,
    Global,
    QuestionPre,
    QuestionStart,
    QuestionContent,
    CurrentScope,
}
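/// Which JSON object currently receives free-form keyword content.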
#[derive(Debug, Clone, Copy)]
enum DataScope {
    Global,
    QuestionPre,
    QuestionContent,
}
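/// Mutable parsing state threaded through `parse_file`.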
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for fields read before a question starts (copied into each new question)
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current JSON key
    cur_tag: String,
    // current JSON value
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. JSON key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
// push a question only when it has both "Вопрос" and "Ответ" keys
trait PushIfValid {
    fn is_valid(&self) -> bool;
    fn push_if_valid(&mut self, value: json::JsonValue);
}
impl PushIfValid for json::JsonValue {
    fn is_valid(&self) -> bool {
        self.has_key("Вопрос") && self.has_key("Ответ")
    }
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if value.is_valid() {
            self.push(value).unwrap_or(())
        }
    }
}
impl Context {
    fn new() -> Context {
        Context {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_tag: String::new(),
            cur_content: Vec::<String>::new(),
            cur_scope: DataScope::Global,
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}
impl KeywordType {
    fn from(pattern: &str) -> KeywordType {
        use KeywordType::*;
        match pattern {
            "Мета:" => Ignore,
            "Чемпионат:" | "Пакет:" => Global,
            "Тур:" => QuestionPre,
            "Вопрос " | "Вопрос:" => QuestionStart,
            "Ответ:" | "Зачет:" => QuestionContent,
            // "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" |
            // "Копирайт:" | "Инфо:" | "Тема:" | "Вид:" | "Тип:" | "Источник:" |
            // "Рейтинг:" | "Комментарий:" | "Комментарии:"
            _ => CurrentScope,
        }
    }
}
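/// Parses a single KOI8-R encoded txt file into a JSON object, collecting the
/// questions under the "Вопросы" key.
///
/// The expected layout (inferred from the keyword table below, not from a
/// format spec) is a keyword on its own line followed by its content on the
/// next line(s), e.g.:
///
/// ```text
/// Чемпионат:
/// <championship name>
/// Вопрос 1:
/// <question text>
/// Ответ:
/// <answer text>
/// ```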
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
    let buf = io::BufReader::new(file);
    let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
    let patterns = vec![
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];
    let mut context = Context::new();
    let ctx = &mut context;
    reader
        .lines()
        .map(|line| String::from(line.unwrap().trim()))
        .filter(|line| !line.is_empty()) // ignore empty lines
        .for_each(|line| {
            match patterns
                .iter() // find keyword
                .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
            {
                Some(pattern) => {
                    use KeywordType::*;
                    ctx.last_keyword_type = ctx.cur_keyword_type;
                    ctx.last_tag = ctx.cur_tag.clone();
                    ctx.cur_keyword_type = Some(KeywordType::from(pattern));
                    ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
                    // remember the question id
                    if let Some(QuestionStart) = ctx.cur_keyword_type {
                        ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                    };
                    // apply accumulated content when a new keyword is found
                    match ctx.last_keyword_type {
                        Some(Global) => {
                            ctx.cur_scope = DataScope::Global;
                            ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
                        }
                        Some(QuestionPre) => {
                            ctx.cur_scope = DataScope::QuestionPre;
                            ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        }
                        Some(QuestionStart) => {
                            ctx.cur_scope = DataScope::QuestionContent;
                            // store the previous question before reading a new one
                            if ctx.have_new_question {
                                ctx.questions.push_if_valid(ctx.cur_question.clone());
                            }
                            // start the new question from the accumulated pre-question fields
                            ctx.cur_question = ctx.cur_question_pre.clone();
                            ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                            ctx.have_new_question = true;
                        }
                        Some(QuestionContent) => {
                            ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        }
                        Some(CurrentScope) => {
                            // pick the object that owns fields in the current scope
                            let scope_data = match ctx.cur_scope {
                                DataScope::Global => &mut ctx.data,
                                DataScope::QuestionPre => &mut ctx.cur_question_pre,
                                DataScope::QuestionContent => &mut ctx.cur_question,
                            };
                            scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        }
                        _ => (), // None or Ignore
                    };
                    // clear accumulated content
                    ctx.cur_content.clear();
                }
                None => {
                    // accumulate content if the line is not a keyword
                    ctx.cur_content.push(line);
                }
            }
        });
    // finish reading the last question
    if ctx.have_new_question && !ctx.cur_content.is_empty() {
        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
        ctx.questions.push_if_valid(ctx.cur_question.clone());
        ctx.have_new_question = false;
    }
    ctx.data["Вопросы"] = ctx.questions.clone();
    Ok(ctx.data.clone())
}
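/// Splits a slice into `num` consecutive parts of roughly equal length; the
/// last part absorbs the remainder, e.g. splitting a 5-element slice into 2
/// parts yields lengths 2 and 3. A slice shorter than `num` is returned as a
/// single part.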
fn split_vec<'a, T>(src: &'a [T], num: usize) -> Vec<&'a [T]> {
    let part_len = src.len() / num;
    let add_len = src.len() % num;
    let mut result = Vec::<&'a [T]>::with_capacity(num);
    if 0 == part_len {
        result.push(src);
        return result;
    }
    for i in 0..num {
        let size = if (num - 1) == i {
            part_len + add_len
        } else {
            part_len
        };
        let start = part_len * i;
        result.push(&src[start..(start + size)]);
    }
    result
}
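/// Parses each listed txt file from the archive and writes pretty-printed
/// JSON under OUTPUT_PATH, keeping the original name with a .json extension.
/// Every chunk opens its own handle on the zip file, since reading entries
/// from a ZipArchive requires mutable access.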
fn process_files(files: &&[PathBuf]) {
    let zip_file = fs::File::open(BASE_FILENAME).unwrap();
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
    files.iter().for_each(|name| {
        let name_str = name.to_str().unwrap();
        // parse the txt file
        let file = archive.by_name(name_str).unwrap();
        let data = parse_file(file).unwrap();
        // build the output filename
        let mut outfilename = PathBuf::from(OUTPUT_PATH);
        outfilename.push(name);
        outfilename.set_extension("json");
        // save the JSON to file
        let mut outfile = fs::File::create(outfilename).unwrap();
        data.write_pretty(&mut outfile, 1).unwrap();
    });
}
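/// Lists the txt files in the archive, then splits the list into one chunk
/// per rayon worker thread and converts the chunks in parallel.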
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // open the archive just to list its files
    let zip_file = fs::File::open(BASE_FILENAME)?;
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader)?;
    let source_files: Vec<PathBuf> = (0..archive.len())
        .map(|i| archive.by_index(i).unwrap().mangled_name())
        .filter(|name| {
            // skip files without a "txt" extension
            match name.extension() {
                Some(ext) => match ext.to_str() {
                    Some(ext_str) => ext_str.eq_ignore_ascii_case("txt"),
                    _ => false, // extension is not valid unicode
                },
                _ => false, // no extension in filename
            }
        })
        .collect();
    drop(archive);
    // check the output directory
    let out_dir: PathBuf = OUTPUT_PATH.into();
    if out_dir.is_file() {
        return Err("output path exists and is a file!".into());
    } else if !out_dir.exists() {
        fs::create_dir_all(out_dir)?;
    }
    println!(
        "processing {} files with {} threads...",
        source_files.len(),
        rayon::current_num_threads()
    );
    // split the file list and process its parts in parallel
    split_vec(&source_files, rayon::current_num_threads())
        .par_iter()
        .for_each(process_files);
    println!("done");
    Ok(())
}