chgk_txt2json/src/main.rs

278 lines
8.8 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

extern crate async_zip;
extern crate json;
extern crate tokio;
use async_zip::read::fs::ZipFileReader;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::{fs, task};
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};
/// Path of the zip archive that `main` opens as the input.
const BASE_FILENAME: &str = "baza_utf8.zip";
/// Directory under which the generated `.json` files are written.
const OUTPUT_PATH: &str = "json";
/// Classification of a keyword line; decides where the text that follows the
/// keyword is stored once the next keyword (or the end of input) is reached.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum KeywordType {
    /// The text following the keyword is discarded.
    Ignore,
    /// The text is stored in the top-level output object.
    Global,
    /// The text is stored in the template that every following question starts from.
    QuestionPre,
    /// The keyword opens a new question; the text is the question body.
    QuestionStart,
    /// The text is stored in the question currently being read.
    QuestionContent,
    /// The text is stored in whichever scope is active at that point.
    CurrentScope,
}
/// Destination for fields whose keyword has no fixed place of its own
/// (`KeywordType::CurrentScope`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DataScope {
    /// Fields go into the top-level output object.
    Global,
    /// Fields go into the template shared by the following questions.
    QuestionPre,
    /// Fields go into the question currently being read.
    QuestionContent,
}
/// Mutable parser state that `parse_file` carries from one input line to the next.
struct Context {
    /// Top-level output object.
    data: json::JsonValue,
    /// Array collecting the questions that passed validation.
    questions: json::JsonValue,
    /// Type of the keyword that was read most recently.
    cur_keyword_type: Option<KeywordType>,
    /// Question currently being assembled.
    cur_question: json::JsonValue,
    /// Fields read ahead of a question; copied into every new question.
    cur_question_pre: json::JsonValue,
    /// Scope that receives fields of type `KeywordType::CurrentScope`.
    cur_scope: DataScope,
    /// JSON key derived from the most recent keyword.
    cur_tag: String,
    /// Content lines accumulated since the most recent keyword.
    cur_content: Vec<String>,
    /// True once a question has been started and still has to be pushed.
    have_new_question: bool,
    /// Type of the keyword read before the current one.
    last_keyword_type: Option<KeywordType>,
    /// JSON key of the previous keyword (used to store the accumulated content when a new keyword is read).
    last_tag: String,
}
/// Validity check for question values, plus a push that applies the check first.
trait PushIfValid {
    /// Reports whether `self` passes the implementor's validity check.
    fn is_valid(&self) -> bool;
    /// Appends `value` to `self`; implementors are expected to skip invalid values.
    fn push_if_valid(&mut self, value: json::JsonValue);
}
/// Question validation for raw JSON values.
impl PushIfValid for json::JsonValue {
    /// A value counts as a complete question when it carries both the
    /// "Вопрос" and the "Ответ" key.
    fn is_valid(&self) -> bool {
        ["Вопрос", "Ответ"].iter().all(|&key| self.has_key(key))
    }

    /// Appends `value` to `self` when it is a complete question; anything
    /// else is silently dropped.
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        // Best effort: an error reported by `push` is deliberately discarded.
        let _ = self.push(value);
    }
}
impl Context {
    /// Creates the state used at the start of a file: empty JSON containers,
    /// no keyword seen yet, global scope, and no question in progress.
    fn new() -> Context {
        Self {
            data: json::JsonValue::new_object(),
            questions: json::JsonValue::new_array(),
            cur_keyword_type: None,
            cur_question: json::JsonValue::new_object(),
            cur_question_pre: json::JsonValue::new_object(),
            cur_scope: DataScope::Global,
            cur_tag: String::new(),
            cur_content: Vec::new(),
            have_new_question: false,
            last_keyword_type: None,
            last_tag: String::new(),
        }
    }
}
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
async fn parse_file(
entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
let buf_reader = BufReader::new(entry_reader);
let mut lines = buf_reader.lines();
let patterns = vec![
"Чемпионат:",
"Пакет:",
"URL:",
"Ссылка:",
"Дата:",
"Редактор:",
"Обработан:",
"Копирайт:",
"Инфо:",
"Тема:",
"Вид:",
"Тип:",
"Тур:",
"Мета:",
"Вопрос ",
"Вопрос:",
"Ответ:",
"Зачет:",
"Источник:",
"Рейтинг:",
"Автор:",
"Комментарий:",
"Комментарии:",
];
let mut context = Context::new();
let mut ctx = &mut context;
while let Some(line_r) = lines.next_line().await? {
let line = line_r.trim();
if line.is_empty() {
continue;
}
let line = line.to_string();
match patterns
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
use KeywordType::*;
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(pattern.parse().unwrap());
ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
// remember question id
if let Some(QuestionStart) = ctx.cur_keyword_type {
ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
Some(Global) => {
ctx.cur_scope = DataScope::Global;
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
Some(QuestionPre) => {
ctx.cur_scope = DataScope::QuestionPre;
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(QuestionStart) => {
ctx.cur_scope = DataScope::QuestionContent;
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push_if_valid(ctx.cur_question.clone());
}
// prepare to read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
Some(QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(CurrentScope) => {
// match value to store data
let scope_data = match ctx.cur_scope {
DataScope::Global => &mut ctx.data,
DataScope::QuestionPre => &mut ctx.cur_question_pre,
DataScope::QuestionContent => &mut ctx.cur_question,
};
scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
_ => (), //None or Ignore
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(line);
}
}
}
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
ctx.questions.push_if_valid(ctx.cur_question.clone());
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
/// Converts one archive entry to JSON and writes it below `OUTPUT_PATH`.
///
/// * `archive` - opened zip archive containing the entry.
/// * `index` - position of the entry inside the archive.
/// * `name` - entry name; it determines the output path, with the extension
///   replaced by `json`.
///
/// Only the plain components of `name` are used, so an absolute name or one
/// containing `..` cannot place the output outside `OUTPUT_PATH`. Missing
/// parent directories of nested entries are created.
///
/// # Errors
///
/// Fails when `name` contains no usable component, when the entry cannot be
/// opened or parsed, or when the output cannot be created, written or flushed.
async fn process_file(
    archive: &ZipFileReader,
    index: usize,
    name: String,
) -> Result<(), Box<dyn std::error::Error>> {
    // Keep only ordinary path components of the entry name.
    let relative: PathBuf = std::path::Path::new(&name)
        .components()
        .filter_map(|component| match component {
            std::path::Component::Normal(part) => Some(part),
            _ => None,
        })
        .collect();
    if relative.as_os_str().is_empty() {
        return Err(format!("archive entry {:?} has no usable file name", name).into());
    }
    // make output filename
    let mut outfilename = PathBuf::from(OUTPUT_PATH);
    outfilename.push(relative);
    outfilename.set_extension("json");

    let entry_reader = archive.entry_reader(index).await?;
    let new_data = parse_file(entry_reader).await?;
    // Pretty-printing is CPU-bound, so it runs on the blocking thread pool.
    let data_str = task::spawn_blocking(move || new_data.pretty(2)).await?;

    // save json to file
    if let Some(parent) = outfilename.parent() {
        fs::create_dir_all(parent).await?;
    }
    let mut outfile = fs::File::create(outfilename).await?;
    outfile.write_all(data_str.as_bytes()).await?;
    // Without an explicit flush the write may still be pending when the file
    // is dropped, and any error it produces would be lost.
    outfile.flush().await?;
    Ok(())
}
/// Entry point: converts every `.txt` entry of `BASE_FILENAME` into a JSON
/// file under `OUTPUT_PATH`, one entry after another, stopping at the first
/// error.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let archive = ZipFileReader::new(String::from(BASE_FILENAME)).await?;

    // Collect (entry index, entry name) for every non-directory "*.txt" entry.
    let mut source_files: Vec<(usize, String)> = Vec::new();
    for (index, entry) in archive.entries().iter().enumerate() {
        if !entry.dir() && entry.name().ends_with(".txt") {
            source_files.push((index, entry.name().to_string()));
        }
    }

    // Make sure the output location exists and is not a regular file.
    match fs::metadata(OUTPUT_PATH).await {
        Ok(meta) => {
            if meta.is_file() {
                return Err("output directory is file!".into());
            }
        }
        Err(_) => fs::create_dir_all(OUTPUT_PATH).await?,
    }

    println!("processing {} files ...", source_files.len());
    for (index, name) in source_files {
        process_file(&archive, index, name).await?;
    }
    println!("done");
    Ok(())
}