//! chgk_txt2json — reads KOI8-R encoded question-base text files from
//! `baza.zip` and converts each one to a pretty-printed JSON document
//! under `./json/`, processing chunks of files in parallel via rayon.
extern crate encoding;
extern crate json;
extern crate rayon;
extern crate textstream;
extern crate zip;

use encoding::all::KOI8_R;
use encoding::DecoderTrap;
use rayon::prelude::*;
use std::fs;
use std::io;
use std::path::PathBuf;
use textstream::TextReader;

const BASE_FILENAME: &str = "./baza.zip";
2019-07-26 12:24:25 +03:00
/// How a recognized keyword line affects the parser state.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    // keyword is recognized but its content is discarded ("Мета:")
    Ignore,
    // content belongs to the whole file ("Чемпионат:", "Пакет:")
    Global,
    // content belongs to all upcoming questions ("Тур:")
    QuestionPre,
    // starts a new question ("Вопрос N:", "Вопрос:")
    QuestionStart,
    // content belongs to the current question ("Ответ:", "Зачет:")
    QuestionContent,
    // any other keyword: stored in whatever scope is currently active
    CurrentScope,
}
/// Which JSON object currently receives `CurrentScope` fields.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    // the top-level output object
    Global,
    // the "pre" object whose fields are copied into each new question
    QuestionPre,
    // the question currently being read
    QuestionContent,
}
fn keyword_type(pattern: &str) -> KeywordType {
2019-07-26 12:25:45 +03:00
use KeywordType::*;
2019-07-26 12:24:25 +03:00
match pattern {
2019-07-26 12:25:45 +03:00
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " => QuestionStart,
"Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
2019-07-26 12:24:25 +03:00
}
}
/// Mutable parser state threaded through `parse_file`.
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    // type of the keyword currently being read
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for pre'question fields
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value (content lines accumulated so far)
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. json key (used to store accumulated content when a new keyword is read)
    last_tag: String,
}
2019-07-26 12:25:45 +03:00
fn parse_file(file: impl io::Read) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = io::BufReader::new(file);
2019-07-26 12:24:25 +03:00
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![
"Чемпионат:",
2019-07-26 12:25:45 +03:00
"Пакет:",
2019-07-26 12:24:25 +03:00
"URL:",
2019-07-26 12:25:45 +03:00
"Ссылка:",
2019-07-26 12:24:25 +03:00
"Дата:",
"Редактор:",
2019-07-26 12:25:45 +03:00
"Обработан:",
"Копирайт:",
"Инфо:",
"Тема:",
2019-07-26 12:24:25 +03:00
"Вид:",
2019-07-26 12:25:45 +03:00
"Тип:",
2019-07-26 12:24:25 +03:00
"Тур:",
2019-07-26 12:25:45 +03:00
"Мета:",
2019-07-26 12:24:25 +03:00
"Вопрос ",
2019-07-26 12:25:45 +03:00
"Вопрос:",
2019-07-26 12:24:25 +03:00
"Ответ:",
"Зачет:",
"Источник:",
2019-07-26 12:25:45 +03:00
"Рейтинг:",
2019-07-26 12:24:25 +03:00
"Автор:",
"Комментарий:",
2019-07-26 12:25:45 +03:00
"Комментарии:",
2019-07-26 12:24:25 +03:00
];
// init context
let mut context = Context {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
2019-07-26 12:25:45 +03:00
cur_scope: DataScope::Global,
2019-07-26 12:24:25 +03:00
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
};
let mut ctx = &mut context;
for line in reader.lines() {
// ignore empty lines
let line_str = String::from(line.unwrap().trim());
let line_s = &line_str;
2019-07-26 12:49:25 +03:00
if line_s.is_empty() {
2019-07-26 12:24:25 +03:00
continue;
}
// find keywords
match patterns
.iter()
2019-07-26 12:49:25 +03:00
.find(|&&pattern| line_s.starts_with(pattern) && line_s.ends_with(':'))
2019-07-26 12:24:25 +03:00
{
Some(pattern) => {
2019-07-26 12:25:45 +03:00
use KeywordType::*;
2019-07-26 12:24:25 +03:00
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(keyword_type(&pattern));
2019-07-26 12:49:25 +03:00
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
2019-07-26 12:24:25 +03:00
// remember question id
2019-07-26 12:49:25 +03:00
if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_question_pre["id"] = line_s.replace(':', "").as_str().into();
2019-07-26 12:24:25 +03:00
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
2019-07-26 12:25:45 +03:00
Some(Global) => {
ctx.cur_scope = DataScope::Global;
2019-07-26 12:24:25 +03:00
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
2019-07-26 12:25:45 +03:00
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
2019-07-26 12:24:25 +03:00
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
2019-07-26 12:25:45 +03:00
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
2019-07-26 12:24:25 +03:00
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap();
}
// prepare for read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
2019-07-26 12:25:45 +03:00
// ctx.cur_question_pre = json::JsonValue::new_object(); // uncomment => forget pre at new question
2019-07-26 12:24:25 +03:00
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
2019-07-26 12:25:45 +03:00
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
(match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
})[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
2019-07-26 12:24:25 +03:00
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(String::from(line_s));
}
}
}
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into();
ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
2019-07-26 23:19:13 +03:00
/// Splits a slice into `num` contiguous chunks of (almost) equal size.
///
/// The first `num - 1` chunks hold `len / num` elements each; the last
/// chunk additionally absorbs the remainder. If the slice has fewer
/// elements than `num` — or `num` is zero — a single chunk covering the
/// whole slice is returned (the original panicked on `num == 0`).
fn split_vec<T>(src: &[T], num: usize) -> Vec<&[T]> {
    // guard against division by zero below
    if num == 0 {
        return vec![src];
    }
    let part_len = src.len() / num;
    // degenerate case: fewer elements than requested parts
    if part_len == 0 {
        return vec![src];
    }
    let rest = src.len() % num;
    let mut result = Vec::with_capacity(num);
    for i in 0..num {
        let start = part_len * i;
        // last chunk absorbs the remainder
        let size = if i == num - 1 { part_len + rest } else { part_len };
        result.push(&src[start..start + size]);
    }
    result
}
fn process_files(files: Vec<PathBuf>) {
let zip_file = fs::File::open(BASE_FILENAME).unwrap();
2019-07-26 12:25:45 +03:00
let zip_reader = io::BufReader::new(zip_file);
2019-07-26 23:04:41 +03:00
let mut archive = zip::ZipArchive::new(zip_reader).unwrap();
2019-07-26 12:24:25 +03:00
2019-07-26 23:04:41 +03:00
files.iter().for_each(|name| {
let name_str = name.to_str().unwrap();
println!("{:}", name_str);
2019-07-26 12:24:25 +03:00
2019-07-26 23:04:41 +03:00
// parse txt file
let file = archive.by_name(name_str).unwrap();
let data = parse_file(file).unwrap();
// make output filename
2019-07-26 12:24:25 +03:00
let mut outfilename = PathBuf::from("./json");
outfilename.push(name);
outfilename.set_extension("json");
2019-07-26 23:04:41 +03:00
// save json to file
let mut outfile = fs::File::create(outfilename).unwrap();
data.write_pretty(&mut outfile, 1).unwrap();
});
}
/// Lists the "*.txt" entries of the base archive and converts them to
/// JSON in parallel, one chunk of files per rayon worker thread.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // open archive just to list files
    let zip_file = fs::File::open(BASE_FILENAME)?;
    let zip_reader = io::BufReader::new(zip_file);
    let mut archive = zip::ZipArchive::new(zip_reader)?;
    let source_files: Vec<PathBuf> = (0..archive.len())
        .map(|i| archive.by_index(i).unwrap().sanitized_name())
        // keep only entries with a "txt" extension (case-insensitive);
        // names without an extension or with non-unicode extensions are skipped
        .filter(|name| {
            name.extension()
                .and_then(|ext| ext.to_str())
                .map_or(false, |ext| ext.eq_ignore_ascii_case("txt"))
        })
        .collect();
    // release the archive handle; workers reopen it themselves
    drop(archive);
    // split the file list and process the parts in parallel
    split_vec(&source_files, rayon::current_num_threads())
        .par_iter()
        .for_each(|slice| {
            process_files(slice.to_vec());
        });
    Ok(())
}