This commit is contained in:
Dmitry Belyaev 2019-07-26 12:24:25 +03:00
parent c521489c08
commit cbea45413c
Signed by: b4tman
GPG Key ID: 41A00BF15EA7E5F3
4 changed files with 436 additions and 2 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
/target
**/*.rs.bk
baza.zip

244
Cargo.lock generated Normal file
View File

@ -0,0 +1,244 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "adler32"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "byteorder"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bzip2"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bzip2-sys"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cc"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "cfg-if"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "chgk_txt2json"
version = "0.1.0"
dependencies = [
"encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
"json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)",
"textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crc32fast"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding"
version = "0.2.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding-index-japanese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding-index-korean"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding-index-simpchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding-index-singlebyte"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding-index-tradchinese"
version = "1.20141219.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding_index_tests"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "json"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.2.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libflate"
version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memchr"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "podio"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "redox_syscall"
version = "0.1.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rle-decode-fast"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "take_mut"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "textstream"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "time"
version = "0.1.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "zip"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)",
"podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum adler32 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7e522997b529f05601e05166c07ed17789691f562762c7f3b987263d2dedee5c"
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum bzip2 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "42b7c3cbf0fa9c1b82308d57191728ca0256cb821220f4e2fd410a72ade26e3b"
"checksum bzip2-sys 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "6584aa36f5ad4c9247f5323b0a42f37802b37a836f0ad87084d7a33961abe25f"
"checksum cc 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "ce400c638d48ee0e9ab75aef7997609ec57367ccfe1463f21bf53c3eca67bf46"
"checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
"checksum encoding 0.2.33 (registry+https://github.com/rust-lang/crates.io-index)" = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec"
"checksum encoding-index-japanese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91"
"checksum encoding-index-korean 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81"
"checksum encoding-index-simpchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7"
"checksum encoding-index-singlebyte 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a"
"checksum encoding-index-tradchinese 1.20141219.5 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18"
"checksum encoding_index_tests 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569"
"checksum json 0.11.14 (registry+https://github.com/rust-lang/crates.io-index)" = "01d7903059b22f1f09ced2fb9562507e3556a953caa2f835c64ab022bb6148c2"
"checksum libc 0.2.60 (registry+https://github.com/rust-lang/crates.io-index)" = "d44e80633f007889c7eff624b709ab43c92d708caad982295768a7b13ca3b5eb"
"checksum libflate 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)" = "90c6f86f4b0caa347206f916f8b687b51d77c6ef8ff18d52dd007491fd580529"
"checksum memchr 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "148fab2e51b4f1cfc66da2a7c32981d1d3c083a803978268bb11fe4b86925e7a"
"checksum podio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "780fb4b6698bbf9cf2444ea5d22411cef2953f0824b98f33cf454ec5615645bd"
"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
"checksum rle-decode-fast 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe4fa914dec5870285fa7f71f602645da47c486e68486d2b4ceb4a343e90ac"
"checksum take_mut 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
"checksum textstream 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ed81b342f6566026755e7f4b7798810b1c159722e427d212ce72c2c58ffdaa"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
"checksum winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "f10e386af2b13e47c89e7236a7a14a086791a2b88ebad6df9bf42040195cf770"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum zip 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c18fc320faf909036e46ac785ea827f72e485304877faf1a3a39538d3714dbc3"

View File

@ -7,3 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
zip = "0.5"
encoding = "0.2"
textstream = "0.1"
json="0.11"

View File

@ -1,3 +1,188 @@
fn main() {
println!("Hello, world!");
extern crate encoding;
extern crate json;
extern crate textstream;
extern crate zip;
use encoding::all::KOI8_R;
use encoding::DecoderTrap;
use std::fs;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use textstream::TextReader;
/// Kind of field a recognised keyword line introduces; it decides where the
/// text collected after that keyword is stored by `parse_file`.
#[derive(Debug, Clone, Copy)]
enum KeywordType {
/// File-level field; its content is stored in the top-level output object.
Global,
/// Field that precedes a question; its content is kept aside and copied
/// into the next question when that question is started.
QuestionPre,
/// Question header line; the content that follows it is the question text.
QuestionStart,
/// Any other keyword; its content is stored in the question being read.
QuestionContent,
}
/// Classifies a keyword pattern by the kind of field it introduces.
///
/// Tournament-level keywords map to `Global`, the tour keyword to
/// `QuestionPre`, the question header to `QuestionStart`, and every other
/// string (including unknown ones) to `QuestionContent`.
fn keyword_type(pattern: &str) -> KeywordType {
    let global_keywords = ["Чемпионат:", "URL:", "Дата:", "Редактор:", "Вид:"];
    if global_keywords.iter().any(|&keyword| keyword == pattern) {
        return KeywordType::Global;
    }

    if pattern == "Тур:" {
        KeywordType::QuestionPre
    } else if pattern == "Вопрос " {
        KeywordType::QuestionStart
    } else {
        KeywordType::QuestionContent
    }
}
/// Mutable parser state carried across the lines of one input file.
struct Context {
// output object for the whole file (file-level fields plus the questions array)
data: json::JsonValue,
// array collecting the finished questions
questions: json::JsonValue,
// type of the keyword whose content is currently being accumulated
cur_keyword_type: Option<KeywordType>,
// question currently being filled in
cur_question: json::JsonValue,
// fields read before a question starts; copied into the question once it starts
cur_question_pre: json::JsonValue,
// current JSON key (the keyword with spaces and colons removed)
cur_tag: String,
// lines accumulated for the current key; joined with '\n' when stored
cur_content: Vec<String>,
// true while `cur_question` holds a question that still has to be pushed to `questions`
have_new_question: bool,
// type of the previous keyword
last_keyword_type: Option<KeywordType>,
// previous JSON key (used to store the accumulated content when a new keyword is read)
last_tag: String,
}
fn parse_file<R: Read>(file: R) -> Result<json::JsonValue, Box<std::error::Error>> {
let buf = BufReader::new(file);
let reader = TextReader::new(buf, KOI8_R, DecoderTrap::Ignore);
let patterns = vec![
"Чемпионат:",
"URL:",
"Дата:",
"Редактор:",
"Вид:",
"Тур:",
"Вопрос ",
"Ответ:",
"Зачет:",
"Источник:",
"Автор:",
"Комментарий:",
];
// init context
let mut context = Context {
data: json::JsonValue::new_object(),
questions: json::JsonValue::new_array(),
cur_keyword_type: None,
cur_question: json::JsonValue::new_object(),
cur_question_pre: json::JsonValue::new_object(),
cur_tag: String::new(),
cur_content: Vec::<String>::new(),
have_new_question: false,
last_keyword_type: None,
last_tag: String::new(),
};
let mut ctx = &mut context;
for line in reader.lines() {
// ignore empty lines
let line_str = String::from(line.unwrap().trim());
let line_s = &line_str;
if 0 == line_s.len() {
continue;
}
// find keywords
match patterns
.iter()
.find(|&&pattern| line_s.starts_with(pattern) && line_s.ends_with(":"))
{
Some(pattern) => {
ctx.last_keyword_type = ctx.cur_keyword_type;
ctx.last_tag = ctx.cur_tag.clone();
ctx.cur_keyword_type = Some(keyword_type(&pattern));
ctx.cur_tag = pattern.replace(" ", "").replace(":", "");
// remember question id
match ctx.cur_keyword_type {
Some(KeywordType::QuestionStart) => {
ctx.cur_question_pre["id"] = line_s.replace(":", "").as_str().into()
}
_ => (),
};
// apply accumulated content when new keyword found
match ctx.last_keyword_type {
Some(KeywordType::Global) => {
ctx.data[&ctx.last_tag] = ctx.cur_content.join("\n").into()
}
Some(KeywordType::QuestionPre) => {
ctx.cur_question_pre[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(KeywordType::QuestionContent) => {
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
}
Some(KeywordType::QuestionStart) => {
// store prev question before reading new
if ctx.have_new_question {
ctx.questions.push(ctx.cur_question.clone()).unwrap();
}
// prepare for read new question data with cur_question_pre values
ctx.cur_question = ctx.cur_question_pre.clone();
ctx.cur_question_pre = json::JsonValue::new_object();
ctx.cur_question[&ctx.last_tag] = ctx.cur_content.join("\n").into();
ctx.have_new_question = true;
}
None => (),
};
// clear content
ctx.cur_content.clear();
}
None => {
// accumulate content if line is not a keyword
ctx.cur_content.push(String::from(line_s));
}
}
}
// finish reading last question
if ctx.have_new_question && !ctx.cur_content.is_empty() {
ctx.cur_question[&ctx.cur_tag] = ctx.cur_content.join("\n").as_str().into();
ctx.questions.push(ctx.cur_question.clone()).unwrap();
ctx.have_new_question = false;
}
ctx.data["Вопросы"] = ctx.questions.clone();
Ok(ctx.data.clone())
}
/// Converts every file stored in `./baza.zip` into a pretty-printed JSON file
/// under `./json/`, keeping the entry's relative path and replacing its
/// extension with `.json`.
///
/// # Errors
///
/// Returns an error when the archive cannot be opened or read, when an entry
/// fails to parse, or when an output directory or file cannot be written.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let zip_file = fs::File::open("./baza.zip")?;
    let mut archive = zip::ZipArchive::new(BufReader::new(zip_file))?;

    for i in 0..archive.len() {
        let file = archive.by_index(i)?;

        // Directory entries hold no text; their names end with a path separator.
        if file.name().ends_with(|c| c == '/' || c == '\\') {
            continue;
        }

        let name = file.sanitized_name();
        println!("{}", name.as_path().display());

        let data: json::JsonValue = parse_file(file)?;

        let mut outfilename = PathBuf::from("./json");
        outfilename.push(name);
        outfilename.set_extension("json");

        // `File::create` does not create missing directories.
        if let Some(parent) = outfilename.parent() {
            fs::create_dir_all(parent)?;
        }

        // Buffer the output and flush explicitly so that a write error is
        // reported instead of being swallowed when the writer is dropped.
        let mut outfile = std::io::BufWriter::new(fs::File::create(outfilename)?);
        data.write_pretty(&mut outfile, 1)?;
        std::io::Write::flush(&mut outfile)?;
    }

    Ok(())
}