chgk_txt2json/src/main.rs

371 lines
12 KiB
Rust
Raw Normal View History

2022-09-12 12:59:35 +00:00
extern crate async_zip;
2019-07-26 09:24:25 +00:00
extern crate json;
2022-09-12 12:59:35 +00:00
extern crate tokio;
2019-07-26 09:24:25 +00:00
2022-09-12 12:59:35 +00:00
use async_zip::read::fs::ZipFileReader;
2022-09-12 19:34:17 +00:00
use async_zip::write::{EntryOptions, ZipFileWriter};
use async_zip::Compression;
2019-07-26 09:24:25 +00:00
use std::path::PathBuf;
2022-08-25 13:12:47 +00:00
use std::str::FromStr;
2022-09-19 21:16:04 +00:00
use tokio::fs;
2022-10-01 09:42:32 +00:00
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader, BufWriter};
use tokio::sync::mpsc::{self, UnboundedReceiver, UnboundedSender};
2019-07-26 09:24:25 +00:00
2022-09-12 19:34:17 +00:00
/// Path of the zip archive the source `.txt` files are read from.
const INPUT_FILENAME: &str = "baza_utf8.zip";
/// Path of the zip archive the converted JSON files are written to.
const OUTPUT_FILENAME: &str = "json.zip";
2022-09-20 12:34:39 +00:00
/// Compression method used for every entry written to the output archive.
const OUTPUT_COMPRESSION: Compression = Compression::Zstd;
2019-07-26 20:04:41 +00:00
2019-07-26 09:24:25 +00:00
/// Classification of a recognised keyword line.
///
/// The kind decides where the text collected under the keyword is stored once
/// the next keyword is reached.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum KeywordType {
    /// Keyword that has no storage rule of its own (`"Мета:"`).
    Ignore,
    /// Field of the whole package (`"Чемпионат:"`, `"Пакет:"`).
    Global,
    /// Field shared by the questions that follow it (`"Тур:"`).
    QuestionPre,
    /// Header line of a question (`"Вопрос N:"`, `"Вопрос:"`).
    QuestionStart,
    /// Field that always belongs to the current question (`"Ответ:"`, `"Зачет:"`).
    QuestionContent,
    /// Any other keyword: stored in whichever scope is currently active.
    CurrentScope,
}
2022-09-20 11:31:37 +00:00
impl FromStr for KeywordType {
type Err = ();
fn from_str(pattern: &str) -> Result<Self, Self::Err> {
use KeywordType::*;
Ok(match pattern {
"Мета:" => Ignore,
"Чемпионат:" | "Пакет:" => Global,
"Тур:" => QuestionPre,
"Вопрос " | "Вопрос:" => QuestionStart,
"Ответ:" | "Зачет:" => QuestionContent,
_ => CurrentScope,
// "URL:" | "Ссылка:" | "Дата:" | "Обработан:" | "Автор:" | "Редактор:" | "Копирайт:" | "Инфо:" |
// "Тема:" | "Вид:" | "Тип:" | "Источник:" | "Рейтинг:" | "Комментарий:" | "Комментарии:"
})
}
}
2019-07-26 09:25:45 +00:00
/// JSON object that parsed field content is written into.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DataScope {
    /// The top-level output object.
    Global,
    /// The template of fields shared by upcoming questions.
    QuestionPre,
    /// The question that is currently being collected.
    QuestionContent,
}
2022-09-20 11:31:37 +00:00
struct QuestionsParser {
2019-07-27 17:24:49 +00:00
// global output value
data: json::JsonValue,
// temp questions array
questions: json::JsonValue,
cur_keyword_type: Option<KeywordType>,
// temp question value
cur_question: json::JsonValue,
// temp value for pre'question fields
cur_question_pre: json::JsonValue,
// scope for data fields
cur_scope: DataScope,
// curent json key
cur_tag: String,
// current json value
cur_content: Vec<String>,
// need to push temp question value if true
have_new_question: bool,
// prev. keyword type
last_keyword_type: Option<KeywordType>,
// prev. json key (used for store acummulated content when new keyword readed)
last_tag: String,
2019-07-26 09:24:25 +00:00
}
2022-09-20 11:31:37 +00:00
/// Text questions parser
impl QuestionsParser {
const PATTERNS: &'static [&'static str] = &[
2019-07-27 17:24:49 +00:00
"Чемпионат:",
"Пакет:",
"URL:",
"Ссылка:",
"Дата:",
"Редактор:",
"Обработан:",
"Копирайт:",
"Инфо:",
"Тема:",
"Вид:",
"Тип:",
"Тур:",
"Мета:",
"Вопрос ",
"Вопрос:",
"Ответ:",
"Зачет:",
"Источник:",
"Рейтинг:",
"Автор:",
"Комментарий:",
"Комментарии:",
];
2019-07-26 09:24:25 +00:00
2022-09-20 11:31:37 +00:00
/// create new parser
pub fn new() -> QuestionsParser {
QuestionsParser {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
cur_scope: DataScope::Global,
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
}
}
/// join current content lines
fn get_current_content(&self) -> String {
self.cur_content.join("\n")
}
/// clear current content
fn clear_current_content(&mut self) {
self.cur_content.clear()
}
/// add new line to current content
fn append_to_current_content(&mut self, line: String) {
self.cur_content.push(line);
}
/// check current question have required fields
fn is_current_question_valid(&self) -> bool {
self.cur_question.has_key("Вопрос") && self.cur_question.has_key("Ответ")
}
/// add current question to parsed array
fn add_cur_question(&mut self) {
if self.is_current_question_valid() {
2022-09-23 20:20:17 +00:00
let current = std::mem::replace(&mut self.cur_question, self.cur_question_pre.clone());
self.questions.push(current).unwrap()
2022-09-12 12:59:35 +00:00
}
2022-09-20 11:31:37 +00:00
}
/// set current content to last tag(keyword) to data scope
fn apply_content_to(&mut self, scope: DataScope) {
let content = self.get_current_content();
// match value to store data
let scope_data = match scope {
DataScope::Global => &mut self.data,
DataScope::QuestionPre => &mut self.cur_question_pre,
DataScope::QuestionContent => &mut self.cur_question,
};
scope_data[&self.last_tag] = content.into();
self.clear_current_content();
}
/// set current content to last tag(keyword) to current scope
fn apply_content_to_cur_scope(&mut self) {
self.apply_content_to(self.cur_scope);
}
/// set current scope
fn set_scope(&mut self, scope: DataScope) {
self.cur_scope = scope;
}
/// set current scope and set current content to last tag(keyword) to data scope
fn set_scope_and_apply(&mut self, scope: DataScope) {
self.set_scope(scope);
self.apply_content_to_cur_scope();
}
/// add last question (if have) and start collecting new one
fn start_new_question(&mut self) {
// store prev question before reading new
if self.have_new_question {
self.add_cur_question();
}
self.have_new_question = true;
}
2022-09-12 12:59:35 +00:00
2022-09-20 11:31:37 +00:00
/// check last tag(keyword) and set current content to corresponding data scope
fn apply_content_for_last_keyword(&mut self) {
// apply accumulated content when new keyword found
match self.last_keyword_type {
Some(KeywordType::Global) => {
self.set_scope_and_apply(DataScope::Global);
}
Some(KeywordType::QuestionPre) => {
self.set_scope_and_apply(DataScope::QuestionPre);
}
Some(KeywordType::QuestionStart) => {
self.start_new_question();
self.set_scope_and_apply(DataScope::QuestionContent);
}
Some(KeywordType::QuestionContent) => {
self.apply_content_to(DataScope::QuestionContent);
}
Some(KeywordType::CurrentScope) => {
self.apply_content_to_cur_scope();
}
_ => (), //None or Ignore
};
}
/// set current keyword(tag) and type as last, and set new as current
fn set_new_keyword(&mut self, keyword: &str) {
2022-09-23 19:50:42 +00:00
self.last_keyword_type =
std::mem::replace(&mut self.cur_keyword_type, Some(keyword.parse().unwrap()));
self.last_tag = std::mem::replace(
&mut self.cur_tag,
keyword.trim_end().trim_end_matches(':').to_string(),
);
2022-09-20 11:31:37 +00:00
}
/// if line matched keyword
fn on_keyword_match(&mut self, line: &str, keyword: &str) {
self.set_new_keyword(keyword);
// remember question id
if let Some(KeywordType::QuestionStart) = self.cur_keyword_type {
2022-09-23 19:37:17 +00:00
self.cur_question_pre["id"] = line.trim_end().trim_end_matches(':').into();
2022-09-20 11:31:37 +00:00
};
self.apply_content_for_last_keyword();
}
/// parse next line
pub fn parse_line(&mut self, line: &str) {
match QuestionsParser::PATTERNS
2022-09-12 12:59:35 +00:00
.iter() // find keyword
.find(|&&pattern| line.starts_with(pattern) && line.ends_with(':'))
{
Some(pattern) => {
2022-09-20 11:31:37 +00:00
self.on_keyword_match(line, pattern);
2019-07-27 17:24:49 +00:00
}
2022-09-12 12:59:35 +00:00
None => {
2022-09-20 11:31:37 +00:00
self.append_to_current_content(line.to_string());
2022-09-12 12:59:35 +00:00
}
}
}
2019-07-26 09:24:25 +00:00
2022-09-20 11:31:37 +00:00
/// finish parsing
pub fn finish(&mut self) {
if self.have_new_question && !self.cur_content.is_empty() {
self.cur_question[&self.cur_tag] = self.get_current_content().into();
self.add_cur_question();
self.clear_current_content();
self.have_new_question = false;
}
2022-09-23 19:50:42 +00:00
self.data["Вопросы"] = std::mem::replace(&mut self.questions, json::JsonValue::new_array());
2022-09-20 11:31:37 +00:00
}
/// get parsed data
pub fn get_parsed(self) -> json::JsonValue {
self.data
}
}
2022-10-01 09:42:32 +00:00
/// Converted text of one source file, passed from the converter to the writer.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FileText {
    /// Name of the source file the text was produced from.
    name: String,
    /// Converted text of the file.
    text: String,
}
/// Message sent by the text reader for the file it is currently reading.
#[derive(Debug, Clone, PartialEq, Eq)]
enum TextReaderMessage {
    /// One line of the file.
    NextLine(String),
    /// The file has been read completely; carries the file name.
    EndOfFile(String),
}
2022-09-20 11:35:47 +00:00
/// read txt files from zip and convert to json
2022-10-01 09:42:32 +00:00
async fn zip_text_reader(tx: UnboundedSender<TextReaderMessage>) {
2019-07-27 17:24:49 +00:00
// open archive just to list files
2022-10-01 09:42:32 +00:00
let archive = ZipFileReader::new(INPUT_FILENAME).await.expect("open zip");
2022-09-12 12:59:35 +00:00
2022-09-19 21:12:49 +00:00
let source_files = archive
2022-09-12 12:59:35 +00:00
.entries()
.iter()
.enumerate()
2022-09-12 19:34:17 +00:00
.filter(|(_, entry)| !entry.dir())
.filter(|(_, entry)| {
2019-07-27 17:24:49 +00:00
// skip files without "txt" extension
2022-09-12 19:34:17 +00:00
entry.name().ends_with(".txt")
2019-07-27 17:24:49 +00:00
})
2022-09-19 21:12:49 +00:00
.map(|(index, entry)| (index, entry.name().to_string()));
//
for (index, name) in source_files {
2022-10-01 09:42:32 +00:00
let entry_reader = archive.entry_reader(index).await.expect("read entry");
let buf_reader = BufReader::new(entry_reader);
let mut lines = buf_reader.lines();
while let Some(line) = lines.next_line().await.expect("next line") {
tx.send(TextReaderMessage::NextLine(line))
.expect("send line");
}
tx.send(TextReaderMessage::EndOfFile(name))
.expect("send end");
2022-09-19 21:12:49 +00:00
}
2022-09-20 08:39:53 +00:00
2022-10-01 09:42:32 +00:00
println!("read done ✅");
}
/// Converts the text received from `rx` to JSON, one document per file.
///
/// Lines are trimmed and blank lines are skipped before parsing. On every
/// `EndOfFile` message the parsed data is pretty-printed, sent to `tx`
/// together with the file name, and a fresh parser is started.
///
/// # Panics
///
/// Panics when a converted document cannot be sent.
async fn questions_converter(
    mut rx: UnboundedReceiver<TextReaderMessage>,
    tx: UnboundedSender<FileText>,
) {
    let mut parser = QuestionsParser::new();
    while let Some(message) = rx.recv().await {
        match message {
            TextReaderMessage::NextLine(raw) => {
                let line = raw.trim();
                if !line.is_empty() {
                    parser.parse_line(line);
                }
            }
            TextReaderMessage::EndOfFile(name) => {
                parser.finish();
                // Swap in a fresh parser for the next file and consume the
                // finished one.
                let finished = std::mem::replace(&mut parser, QuestionsParser::new());
                let text = finished.get_parsed().pretty(2);
                tx.send(FileText { name, text }).expect("send json");
            }
        }
    }

    println!("convert done ✅");
}
2022-09-20 11:35:47 +00:00
/// write json data to zip files
2022-10-01 09:42:32 +00:00
async fn zip_json_writer(mut rx: UnboundedReceiver<FileText>) {
let file = fs::File::create(OUTPUT_FILENAME)
.await
.expect("create file");
2022-09-23 20:01:57 +00:00
let mut buf = BufWriter::with_capacity(100 * 1024 * 1024, file);
let mut writer = ZipFileWriter::new(&mut buf);
2022-09-19 21:12:49 +00:00
2022-10-01 09:42:32 +00:00
while let Some(FileText { name, text: data }) = rx.recv().await {
// make output filename
let mut outfilename = PathBuf::from(name);
outfilename.set_extension("json");
let outfilename = outfilename.to_str().unwrap().to_string();
let opts = EntryOptions::new(outfilename, OUTPUT_COMPRESSION);
2022-09-19 21:16:04 +00:00
// write new zip entry
writer
.write_entry_whole(opts, data.as_bytes())
.await
2022-10-01 09:42:32 +00:00
.expect("write entry");
2022-09-19 21:12:49 +00:00
}
2022-10-01 09:42:32 +00:00
writer.close().await.expect("close writer");
buf.flush().await.expect("flush buffer");
2022-09-20 11:31:37 +00:00
2022-09-20 08:39:53 +00:00
println!("write done ✅");
2022-09-19 21:12:49 +00:00
}
2022-08-25 12:50:37 +00:00
2022-09-19 21:12:49 +00:00
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
2022-09-12 19:34:17 +00:00
// check output filename
match fs::metadata(OUTPUT_FILENAME).await {
Ok(x) if x.is_dir() => return Err("output file is a directory!".into()),
2022-09-12 12:59:35 +00:00
_ => (),
2022-08-25 11:45:03 +00:00
};
2022-08-25 12:50:37 +00:00
2022-10-01 09:42:32 +00:00
let (reader_tx, reader_rx) = mpsc::unbounded_channel::<TextReaderMessage>();
let (json_tx, json_rx) = mpsc::unbounded_channel::<FileText>();
2022-09-19 21:12:49 +00:00
2022-09-20 08:39:53 +00:00
tokio::try_join!(
2022-10-01 09:42:32 +00:00
tokio::spawn(zip_text_reader(reader_tx)),
tokio::spawn(questions_converter(reader_rx, json_tx)),
tokio::spawn(zip_json_writer(json_rx))
2022-09-20 08:39:53 +00:00
)?;
2022-08-25 12:50:37 +00:00
2022-09-20 08:39:53 +00:00
println!("all done ✅");
2019-07-27 17:24:49 +00:00
Ok(())
2019-07-25 09:02:25 +00:00
}