// chgk_txt2json/src/main.rs
// Converts the CHGK ("Что? Где? Когда?") question text base from a zip
// archive of *.txt files into per-file JSON documents (278 lines, ~8.8 KiB).
// 2015-edition crate declarations (redundant since the 2018 edition, kept as-is).
extern crate async_zip;
extern crate json;
extern crate tokio;

use async_zip::read::fs::ZipFileReader;
use std::path::PathBuf;
use std::str::FromStr;
use tokio::{fs, task};
use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader};

// Input zip archive with the question base (name suggests UTF-8 encoded text).
const BASE_FILENAME: &str = "baza_utf8.zip";
// Directory where the generated JSON files are written.
const OUTPUT_PATH: &str = "json";
/// Category of a recognized keyword line in a source text file,
/// deciding where the content that follows it is stored.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
    // keyword is recognized but its content is discarded ("Мета:")
    Ignore,
    // content goes into the top-level (tournament-wide) object
    Global,
    // content goes into fields shared by the following questions ("Тур:")
    QuestionPre,
    // keyword opens a new question ("Вопрос N:" / "Вопрос:")
    QuestionStart,
    // content belongs to the current question ("Ответ:", "Зачет:")
    QuestionContent,
    // keyword keeps whatever scope is currently active
    CurrentScope,
}
/// Which JSON object accumulated content is currently written into.
#[derive(Debug, Clone, Copy)]
enum DataScope {
    // top-level tournament data
    Global,
    // template fields copied into every subsequent question
    QuestionPre,
    // the question currently being read
    QuestionContent,
}
/// Mutable state carried through `parse_file` while scanning one file.
struct Context {
    // global output value
    data: json::JsonValue,
    // temp questions array
    questions: json::JsonValue,
    // type of the keyword currently being read (None before the first one)
    cur_keyword_type: Option<KeywordType>,
    // temp question value
    cur_question: json::JsonValue,
    // temp value for pre-question fields (copied into each new question)
    cur_question_pre: json::JsonValue,
    // scope for data fields
    cur_scope: DataScope,
    // current json key
    cur_tag: String,
    // current json value (accumulated lines, later joined with '\n')
    cur_content: Vec<String>,
    // need to push temp question value if true
    have_new_question: bool,
    // prev. keyword type
    last_keyword_type: Option<KeywordType>,
    // prev. json key (stores accumulated content when a new keyword is read)
    last_tag: String,
}
2019-07-30 18:15:20 +00:00
// check questions before push
/// Validation helper for JSON arrays of questions.
trait PushIfValid {
    // true when the value has the mandatory question fields (see impl)
    fn is_valid(&self) -> bool;
    // append `value` to `self` only when it is valid
    fn push_if_valid(&mut self, value: json::JsonValue);
}
impl PushIfValid for json::JsonValue {
    /// A question is valid when it carries both the question text
    /// ("Вопрос") and the answer ("Ответ") keys.
    fn is_valid(&self) -> bool {
        self.has_key("Вопрос") && self.has_key("Ответ")
    }

    /// Append `value` to `self` (expected to be a JSON array) when valid;
    /// invalid questions are silently dropped.
    fn push_if_valid(&mut self, value: json::JsonValue) {
        if !value.is_valid() {
            return;
        }
        // push only errors when `self` is not an array; deliberately ignored
        let _ = self.push(value);
    }
}
2019-07-27 08:30:40 +00:00
impl Context {
2019-07-27 17:24:49 +00:00
fn new() -> Context {
Context {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
}
2022-08-25 13:12:47 +00:00
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
2019-07-27 17:24:49 +00:00
use KeywordType::*;
2022-08-25 13:12:47 +00:00
Ok(match pattern {
2019-07-27 17:24:49 +00:00
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
2019-07-28 12:42:25 +00:00
"Вопрос " | "Вопрос:" => QuestionStart,
2019-07-27 17:24:49 +00:00
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
2022-08-25 13:12:47 +00:00
})
2019-07-27 17:24:49 +00:00
}
2019-07-27 08:30:40 +00:00
}
2022-09-12 12:59:35 +00:00
/// Parse one text file of the question base into a JSON tree.
///
/// The format is line-oriented: a line that starts with a known keyword
/// AND ends with ':' opens a new field; all following non-keyword lines
/// are accumulated and joined with '\n' as that field's content when the
/// next keyword appears.
///
/// Returns the top-level object with a "Вопросы" array of parsed
/// questions, or an I/O error from reading `entry_reader`.
async fn parse_file(
    entry_reader: impl AsyncReadExt + Unpin,
) -> Result<json::JsonValue, Box<dyn std::error::Error>> {
    let buf_reader = BufReader::new(entry_reader);
    let mut lines = buf_reader.lines();

    // every keyword the format knows; "Вопрос " (with a trailing space)
    // matches numbered headers like "Вопрос 1:"
    let patterns = vec![
        "Чемпионат:",
        "Пакет:",
        "URL:",
        "Ссылка:",
        "Дата:",
        "Редактор:",
        "Обработан:",
        "Копирайт:",
        "Инфо:",
        "Тема:",
        "Вид:",
        "Тип:",
        "Тур:",
        "Мета:",
        "Вопрос ",
        "Вопрос:",
        "Ответ:",
        "Зачет:",
        "Источник:",
        "Рейтинг:",
        "Автор:",
        "Комментарий:",
        "Комментарии:",
    ];
    let mut context = Context::new();
    // NOTE(review): `mut` on `ctx` is unneeded — the reference is never
    // reassigned; the struct is mutated through it
    let mut ctx = &mut context;

    while let Some(line_r) = lines.next_line().await? {
        let line = line_r.trim();
        if line.is_empty() {
            continue;
        }
        let line = line.to_string();
        match patterns
            .iter() // find keyword
            // keyword line must start with a pattern AND end with ':',
            // so e.g. "Вопрос 1: inline text" is treated as content
            .find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
        {
            Some(pattern) => {
                use KeywordType::*;
                // remember the previous keyword/tag — its accumulated
                // content is flushed in the match below
                ctx.last_keyword_type = ctx.cur_keyword_type;
                ctx.last_tag = ctx.cur_tag.clone();
                // `parse` is KeywordType::from_str, which never fails
                // (unknown keywords map to CurrentScope)
                ctx.cur_keyword_type = Some(pattern.parse().unwrap());
                // JSON key = keyword with spaces and colon stripped
                ctx.cur_tag = pattern.replace(' ', "").replace(':', "");
                // remember question id
                if let Some(QuestionStart) = ctx.cur_keyword_type {
                    ctx.cur_question_pre["id"] = line.replace(':', "").as_str().into();
                };
                // apply accumulated content when new keyword found
                match ctx.last_keyword_type {
                    Some(Global) => {
                        // subsequent unknown keywords write to top level
                        ctx.cur_scope = DataScope::Global;
                        ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
                    }
                    Some(QuestionPre) => {
                        // fields stored here get copied into every
                        // following question
                        ctx.cur_scope = DataScope::QuestionPre;
                        ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(QuestionStart) => {
                        ctx.cur_scope = DataScope::QuestionContent;
                        // store prev question before reading new
                        if ctx.have_new_question {
                            ctx.questions.push_if_valid(ctx.cur_question.clone());
                        }
                        // prepare to read new question data with cur_question_pre values
                        ctx.cur_question = ctx.cur_question_pre.clone();
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                        ctx.have_new_question = true;
                    }
                    Some(QuestionContent) => {
                        ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    Some(CurrentScope) => {
                        // match value to store data in the active scope
                        let scope_data = match ctx.cur_scope {
                            DataScope::Global => &mut ctx.data,
                            DataScope::QuestionPre => &mut ctx.cur_question_pre,
                            DataScope::QuestionContent => &mut ctx.cur_question,
                        };
                        scope_data[&ctx.last_tag] = ctx.cur_content.join("\n").into();
                    }
                    _ => (), //None or Ignore
                };
                // clear content
                ctx.cur_content.clear();
            }
            None => {
                // accumulate content if line is not a keyword
                ctx.cur_content.push(line);
            }
        }
    }

    // finish reading last question
    if ctx.have_new_question && !ctx.cur_content.is_empty() {
        ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").into();
        ctx.questions.push_if_valid(ctx.cur_question.clone());
        ctx.have_new_question = false;
    }

    ctx.data["Вопросы"] = ctx.questions.clone();
    Ok(ctx.data.clone())
}
2022-09-12 13:30:13 +00:00
async fn process_file(
archive: &ZipFileReader,
index: usize,
name: String,
) -> Result<(), Box<dyn std::error::Error>> {
2022-09-12 12:59:35 +00:00
let entry_reader = archive.entry_reader(index).await?;
2019-07-26 20:04:41 +00:00
2022-09-12 12:59:35 +00:00
// make output filename
let mut outfilename = PathBuf::from(OUTPUT_PATH);
outfilename.push(name);
outfilename.set_extension("json");
2022-08-25 12:48:52 +00:00
2022-09-12 12:59:35 +00:00
// save json to file
let new_data = parse_file(entry_reader).await?;
2022-09-12 13:38:23 +00:00
let data_str = task::spawn_blocking(move || {
new_data.pretty(2)
}).await?;
2019-07-26 09:24:25 +00:00
2022-09-12 12:59:35 +00:00
let mut outfile = fs::File::create(outfilename).await?;
outfile.write_all(data_str.as_bytes()).await?;
Ok(())
2019-07-26 20:04:41 +00:00
}
2022-09-12 12:59:35 +00:00
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
2019-07-27 17:24:49 +00:00
// open archive just to list files
2022-09-12 12:59:35 +00:00
let archive = ZipFileReader::new(String::from(BASE_FILENAME)).await?;
let source_files: Vec<(usize, String)> = archive
.entries()
.iter()
.enumerate()
.filter(|item| !item.1.dir())
.filter(|item| {
2019-07-27 17:24:49 +00:00
// skip files without "txt" extension
2022-09-12 12:59:35 +00:00
item.1.name().ends_with(".txt")
2019-07-27 17:24:49 +00:00
})
2022-09-12 12:59:35 +00:00
.map(|item| (item.0, item.1.name().to_string()))
2019-07-27 17:24:49 +00:00
.collect();
2022-08-25 12:50:37 +00:00
2022-08-25 11:45:03 +00:00
// check output directory
2022-09-12 12:59:35 +00:00
match fs::metadata(OUTPUT_PATH).await {
Err(_) => fs::create_dir_all(OUTPUT_PATH).await?,
Ok(x) if x.is_file() => return Err("output directory is file!".into()),
_ => (),
2022-08-25 11:45:03 +00:00
};
2022-08-25 12:50:37 +00:00
2022-09-12 12:59:35 +00:00
println!("processing {} files ...", source_files.len());
2022-09-12 13:30:13 +00:00
for i in source_files {
process_file(&archive, i.0, i.1).await?;
2022-09-12 12:59:35 +00:00
}
2022-08-25 12:50:37 +00:00
2022-08-25 11:45:03 +00:00
println!("done");
2019-07-27 17:24:49 +00:00
Ok(())
2019-07-25 09:02:25 +00:00
}