anki-word-query/addons21/fastwq/libs/mdict/readmdict.py
2018-07-30 16:00:47 +08:00

967 lines
39 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# readmdict.py
# Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
#
# Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com>
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
from struct import pack, unpack
from io import BytesIO
import re
import sys
import json
from .ripemd128 import ripemd128
from .pureSalsa20 import Salsa20
# zlib compression is used for engine version >=2.0
import zlib
# LZO compression is used for engine version < 2.0
# LZO is optional: when the module cannot be imported the name is bound to
# None so later code can test for availability.
try:
    import lzo
except ImportError:
    lzo = None
    print("LZO compression support is not available")

# 2x3 compatible: on Python 3 alias ``unicode`` to ``str``
if sys.version_info[0] >= 3:
    unicode = str
def _unescape_entities(text):
"""
unescape offending tags < > " &
"""
text = text.replace(b'&lt;', b'<')
text = text.replace(b'&gt;', b'>')
text = text.replace(b'&quot;', b'"')
text = text.replace(b'&amp;', b'&')
return text
def _fast_decrypt(data, key):
b = bytearray(data)
key = bytearray(key)
previous = 0x36
for i in range(len(b)):
t = (b[i] >> 4 | b[i] << 4) & 0xff
t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
previous = b[i]
b[i] = t
return bytes(b)
def _mdx_decrypt(comp_block):
    """
    Decrypt a block whose first 8 bytes are left in the clear.

    The key is the ripemd128 digest of bytes 4..8 of the block followed by
    the little-endian 32-bit constant 0x3695; everything from byte 8 onward
    is passed through ``_fast_decrypt`` with that key.
    """
    clear_prefix = comp_block[0:8]
    payload = comp_block[8:]
    key = ripemd128(comp_block[4:8] + pack(b'<L', 0x3695))
    return clear_prefix + _fast_decrypt(payload, key)
def _salsa_decrypt(ciphertext, encrypt_key):
    """
    Run ``ciphertext`` through an 8-round Salsa20 instance keyed with
    ``encrypt_key`` and an all-zero 8-byte IV, returning the output bytes.
    """
    zero_iv = b"\x00" * 8
    cipher = Salsa20(key=encrypt_key, IV=zero_iv, rounds=8)
    return cipher.encryptBytes(ciphertext)
def _decrypt_regcode_by_deviceid(reg_code, deviceid):
    """
    Derive the encryption key from a registration code and a device id.

    The ripemd128 digest of ``deviceid`` keys an 8-round Salsa20 instance
    (all-zero 8-byte IV) which is applied to ``reg_code``.
    """
    cipher = Salsa20(key=ripemd128(deviceid), IV=b"\x00" * 8, rounds=8)
    return cipher.encryptBytes(reg_code)
def _decrypt_regcode_by_email(reg_code, email):
    """
    Derive the encryption key from a registration code and an e-mail address.

    ``email`` is decoded with the default codec, re-encoded as UTF-16-LE and
    hashed with ripemd128; the digest keys an 8-round Salsa20 instance
    (all-zero 8-byte IV) which is applied to ``reg_code``.
    """
    utf16_email = email.decode().encode('utf-16-le')
    cipher = Salsa20(key=ripemd128(utf16_email), IV=b"\x00" * 8, rounds=8)
    return cipher.encryptBytes(reg_code)
class MDict(object):
    """
    Base class which reads in header and key block.
    It has no public methods and serves only as code sharing base class.
    """

    def __init__(self, fname, encoding='', passcode=None):
        """
        Read the header and the key list of ``fname``.

        ``encoding`` overrides the encoding declared in the file header when
        non-empty.  ``passcode`` is a ``(regcode, userid)`` pair, needed only
        for files whose header marks the key header as encrypted.
        """
        self._fname = fname
        self._encoding = encoding.upper()
        self._passcode = passcode

        self.header = self._read_header()
        try:
            self._key_list = self._read_keys()
        except Exception:
            # Any failure of the regular reader (missing passcode, checksum
            # mismatch, ...) falls back to scanning for the key block.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            print("Try Brutal Force on Encrypted Key Blocks")
            self._key_list = self._read_keys_brutal()

    def __len__(self):
        return self._num_entries

    def __iter__(self):
        return self.keys()

    def keys(self):
        """
        Return an iterator over dictionary keys.
        """
        return (key_value for key_id, key_value in self._key_list)

    def _read_number(self, f):
        """Read one integer of the file's number width from ``f``."""
        return unpack(self._number_format, f.read(self._number_width))[0]

    def _parse_header(self, header):
        """
        extract attributes from <Dict attr="value" ... >

        Returns a dict mapping attribute name to unescaped value (both bytes).
        """
        taglist = re.findall(br'(\w+)="(.*?)"', header, re.DOTALL)
        tagdict = {}
        for key, value in taglist:
            tagdict[key] = _unescape_entities(value)
        return tagdict

    def _decode_key_block_info(self, key_block_info_compressed, verify_entries=True):
        """
        Decode the key block info section.

        Returns a list of ``(compressed_size, decompressed_size)`` tuples, one
        per key block.  When ``verify_entries`` is true the summed entry count
        must equal ``self._num_entries``; the brute-force reader passes False
        because it has no trustworthy entry count at that point.
        """
        if self._version >= 2:
            # zlib compression
            assert key_block_info_compressed[:4] == b'\x02\x00\x00\x00'
            # decrypt if needed
            if self._encrypt & 0x02:
                key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
            # decompress
            key_block_info = zlib.decompress(key_block_info_compressed[8:])
            # adler checksum
            adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
            assert adler32 == zlib.adler32(key_block_info) & 0xffffffff
            byte_format, byte_width, text_term = '>H', 2, 1
        else:
            # no compression
            key_block_info = key_block_info_compressed
            byte_format, byte_width, text_term = '>B', 1, 0

        # head/tail text sizes are counted in characters; UTF-16 uses 2 bytes each
        char_width = 2 if self._encoding == 'UTF-16' else 1

        key_block_info_list = []
        num_entries = 0
        i = 0
        while i < len(key_block_info):
            # number of entries in current key block
            num_entries += unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
            i += self._number_width
            # text head size, then skip the head text itself
            text_head_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
            i += byte_width
            i += (text_head_size + text_term) * char_width
            # text tail size, then skip the tail text itself
            text_tail_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
            i += byte_width
            i += (text_tail_size + text_term) * char_width
            # key block compressed size
            key_block_compressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
            i += self._number_width
            # key block decompressed size
            key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
            i += self._number_width
            key_block_info_list.append((key_block_compressed_size, key_block_decompressed_size))

        if verify_entries:
            assert num_entries == self._num_entries

        return key_block_info_list

    def _decode_key_block(self, key_block_compressed, key_block_info_list):
        """
        Decompress every key block and return the combined list of
        ``(key_id, key_text)`` tuples.

        Raises ValueError for an unknown compression type.  If an LZO block is
        met while the ``lzo`` module is unavailable, a message is printed and
        the keys decoded so far are returned.
        """
        key_list = []
        i = 0
        for compressed_size, decompressed_size in key_block_info_list:
            start = i
            end = i + compressed_size
            # 4 bytes : compression type
            key_block_type = key_block_compressed[start:start + 4]
            # 4 bytes : adler checksum of decompressed key block
            adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0]
            if key_block_type == b'\x00\x00\x00\x00':
                key_block = key_block_compressed[start + 8:end]
            elif key_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                key_block = lzo.decompress(key_block_compressed[start + 8:end],
                                           initSize=decompressed_size, blockSize=1308672)
            elif key_block_type == b'\x02\x00\x00\x00':
                key_block = zlib.decompress(key_block_compressed[start + 8:end])
            else:
                # previously fell through with key_block unbound or stale
                raise ValueError('unknown key block compression type: %r' % (key_block_type,))
            # notice that adler32 returns signed value
            assert adler32 == zlib.adler32(key_block) & 0xffffffff
            # extract one single key block into a key list
            key_list += self._split_key_block(key_block)
            i += compressed_size
        return key_list

    def _split_key_block(self, key_block):
        """
        Split one decompressed key block into ``(key_id, key_text)`` tuples.

        Each entry is a number (the record's offset in the record block)
        followed by the key text ending with a NUL delimiter.  ``key_text`` is
        returned as stripped UTF-8 bytes.  A final entry without delimiter is
        taken to run to the end of the block.
        """
        # key text ends with '\x00' ('\x00\x00' for UTF-16)
        if self._encoding == 'UTF-16':
            delimiter = b'\x00\x00'
            width = 2
        else:
            delimiter = b'\x00'
            width = 1

        key_list = []
        key_start_index = 0
        while key_start_index < len(key_block):
            text_start = key_start_index + self._number_width
            # the corresponding record's offset in record block
            key_id = unpack(self._number_format, key_block[key_start_index:text_start])[0]
            # default to the end of the block so a missing delimiter can
            # neither reuse a stale index nor loop forever
            key_end_index = len(key_block)
            i = text_start
            while i < len(key_block):
                if key_block[i:i + width] == delimiter:
                    key_end_index = i
                    break
                i += width
            key_text = key_block[text_start:key_end_index]\
                .decode(self._encoding, errors='ignore').encode('utf-8').strip()
            key_start_index = key_end_index + width
            key_list.append((key_id, key_text))
        return key_list

    def _read_header(self):
        """
        Read and parse the file header.

        Sets ``_key_block_offset``, ``_encoding`` (unless given by the caller),
        ``_title``, ``_description``, ``_encrypt``, ``_stylesheet``,
        ``_version``, ``_number_width`` and ``_number_format``; returns the
        dict of header attributes (bytes keys and values).
        """
        with open(self._fname, 'rb') as f:
            # number of bytes of header text
            header_bytes_size = unpack('>I', f.read(4))[0]
            header_bytes = f.read(header_bytes_size)
            # 4 bytes: adler32 checksum of header, in little endian
            adler32 = unpack('<I', f.read(4))[0]
            assert adler32 == zlib.adler32(header_bytes) & 0xffffffff
            # mark down key block offset
            self._key_block_offset = f.tell()

        # header text in utf-16 encoding ending with '\x00\x00'
        header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
        header_tag = self._parse_header(header_text)
        if not self._encoding:
            encoding = header_tag[b'Encoding']
            if sys.hexversion >= 0x03000000:
                encoding = encoding.decode('utf-8')
            # GB18030 > GBK > GB2312
            if encoding in ['GBK', 'GB2312']:
                encoding = 'GB18030'
            self._encoding = encoding

        # read title and description
        if b'Title' in header_tag:
            self._title = header_tag[b'Title'].decode('utf-8')
        else:
            self._title = ''
        if b'Description' in header_tag:
            self._description = header_tag[b'Description'].decode('utf-8')
        else:
            self._description = ''

        # encryption flag
        #   0x00 - no encryption
        #   0x01 - encrypt record block
        #   0x02 - encrypt key info block
        if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
            self._encrypt = 0
        elif header_tag[b'Encrypted'] == b'Yes':
            self._encrypt = 1
        else:
            self._encrypt = int(header_tag[b'Encrypted'])

        # stylesheet attribute if present takes form of:
        #   style_number # 1-255
        #   style_begin  # or ''
        #   style_end    # or ''
        # store stylesheet in dict in the form of
        # {'number' : ('style_begin', 'style_end')}
        # NOTE(review): header_tag keys are bytes, so on Python 3 this str
        # lookup never matches and the stylesheet stays empty.  Left unchanged
        # here because the consumers of ``_stylesheet`` must be updated
        # together with it -- confirm and fix jointly.
        self._stylesheet = {}
        if header_tag.get('StyleSheet'):
            lines = header_tag['StyleSheet'].splitlines()
            for i in range(0, len(lines), 3):
                self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2])

        # before version 2.0, number is 4 bytes integer
        # version 2.0 and above uses 8 bytes
        self._version = float(header_tag[b'GeneratedByEngineVersion'])
        if self._version < 2.0:
            self._number_width = 4
            self._number_format = '>I'
        else:
            self._number_width = 8
            self._number_format = '>Q'

        return header_tag

    def _read_keys(self):
        """
        Read the key section using the sizes stored in its header.

        Sets ``_num_entries`` and ``_record_block_offset`` and returns the key
        list.  Raises RuntimeError when the key header is encrypted and no
        passcode was supplied; assertions fail on checksum or count mismatch.
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)

            # the following numbers could be encrypted
            num_bytes = 8 * 5 if self._version >= 2.0 else 4 * 4
            block = f.read(num_bytes)

            if self._encrypt & 1:
                if self._passcode is None:
                    raise RuntimeError('user identification is needed to read encrypted file')
                regcode, userid = self._passcode
                if isinstance(userid, unicode):
                    userid = userid.encode('utf8')
                if self.header[b'RegisterBy'] == b'EMail':
                    encrypted_key = _decrypt_regcode_by_email(regcode, userid)
                else:
                    encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid)
                block = _salsa_decrypt(block, encrypted_key)

            # decode this block
            sf = BytesIO(block)
            # number of key blocks
            num_key_blocks = self._read_number(sf)
            # number of entries
            self._num_entries = self._read_number(sf)
            # number of bytes of key block info after decompression (not used)
            if self._version >= 2.0:
                self._read_number(sf)
            # number of bytes of key block info
            key_block_info_size = self._read_number(sf)
            # number of bytes of key block
            key_block_size = self._read_number(sf)

            # 4 bytes: adler checksum of previous 5 numbers
            if self._version >= 2.0:
                adler32 = unpack('>I', f.read(4))[0]
                assert adler32 == (zlib.adler32(block) & 0xffffffff)

            # read key block info, which indicates key block's compressed and
            # decompressed size
            key_block_info = f.read(key_block_info_size)
            key_block_info_list = self._decode_key_block_info(key_block_info)
            assert num_key_blocks == len(key_block_info_list)

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        return key_list

    def _read_keys_brutal(self):
        """
        Read the key section without relying on its (possibly encrypted)
        header numbers.

        The end of the key block info is located by scanning forward for the
        4-byte compression marker that starts the first key block.  Sets
        ``_record_block_offset`` and ``_num_entries`` and returns the key list.
        Raises ValueError if the end of file is reached without finding the
        marker.
        """
        # the leading numbers could be encrypted, disregard them!
        if self._version >= 2.0:
            num_bytes = 8 * 5 + 4
            key_block_type = b'\x02\x00\x00\x00'
        else:
            num_bytes = 4 * 4
            key_block_type = b'\x01\x00\x00\x00'
        overlap = len(key_block_type) - 1

        with open(self._fname, 'rb') as f:
            f.seek(self._key_block_offset)
            f.read(num_bytes)

            # key block info
            # 4 bytes '\x02\x00\x00\x00'
            # 4 bytes adler32 checksum
            # unknown number of bytes follows until '\x02\x00\x00\x00' which marks
            # the beginning of key block
            key_block_info = f.read(8)
            if self._version >= 2.0:
                assert key_block_info[:4] == b'\x02\x00\x00\x00'

            # ``tail`` holds the last bytes of the previous chunk so a marker
            # split across two reads is still found
            tail = b''
            while True:
                fpos = f.tell()
                chunk = f.read(1024)
                if not chunk:
                    # previously this looped forever at end of file
                    raise ValueError('key block marker not found before end of file')
                window = tail + chunk
                index = window.find(key_block_type)
                if index != -1:
                    if tail:
                        # the overlap was already appended with the last chunk
                        key_block_info = key_block_info[:-len(tail)]
                    key_block_info += window[:index]
                    f.seek(fpos - len(tail) + index)
                    break
                key_block_info += chunk
                tail = chunk[-overlap:]

            # the entry count is unknown here, so it cannot be verified
            key_block_info_list = self._decode_key_block_info(key_block_info, verify_entries=False)
            key_block_size = sum(compressed for compressed, _ in key_block_info_list)

            # read key block
            key_block_compressed = f.read(key_block_size)
            # extract key block
            key_list = self._decode_key_block(key_block_compressed, key_block_info_list)

            self._record_block_offset = f.tell()

        self._num_entries = len(key_list)
        return key_list
class MDD(MDict):
    """
    MDict resource file format (*.MDD) reader.
    >>> mdd = MDD('example.mdd')
    >>> len(mdd)
    208
    >>> for filename,content in mdd.items():
    ...     print(filename, content[:10])
    """

    # 4-byte compression marker -> numeric type stored in the index
    _BLOCK_TYPE_IDS = {
        b'\x00\x00\x00\x00': 0,  # stored without compression
        b'\x01\x00\x00\x00': 1,  # LZO
        b'\x02\x00\x00\x00': 2,  # zlib
    }

    def __init__(self, fname, passcode=None):
        MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)

    def items(self):
        """Return a generator which in turn produce tuples in the form of (filename, content)
        """
        return self._decode_record_block()

    def _read_record_block_info(self, f):
        """
        Read the record section header from ``f`` (positioned at its start).

        Returns ``(record_block_info_list, record_block_size)`` where the list
        holds one ``(compressed_size, decompressed_size)`` tuple per record
        block.  Assertions check the entry count and the info section size.
        """
        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        record_block_info_list = []
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
        # each info entry consists of two numbers
        assert len(record_block_info_list) * self._number_width * 2 == record_block_info_size
        return record_block_info_list, record_block_size

    def _block_type_id(self, record_block_compressed):
        """Return 0/1/2 for the block's compression marker; ValueError if unknown."""
        marker = record_block_compressed[:4]
        if marker not in self._BLOCK_TYPE_IDS:
            raise ValueError('unknown record block compression type: %r' % (marker,))
        return self._BLOCK_TYPE_IDS[marker]

    def _unpack_record_block(self, record_block_compressed, decompressed_size, type_id):
        """
        Return the decompressed payload of one record block and verify it
        against the adler32 checksum and the expected size.
        """
        # 4 bytes: adler32 checksum of decompressed record block
        adler32 = unpack('>I', record_block_compressed[4:8])[0]
        if type_id == 0:
            record_block = record_block_compressed[8:]
        elif type_id == 1:
            # the payload starts after the 8-byte prefix, as for the other types
            record_block = lzo.decompress(record_block_compressed[8:],
                                          initSize=decompressed_size, blockSize=1308672)
        else:
            record_block = zlib.decompress(record_block_compressed[8:])
        # notice that adler32 return signed value
        assert adler32 == zlib.adler32(record_block) & 0xffffffff
        assert len(record_block) == decompressed_size
        return record_block

    def _decode_record_block(self):
        """
        Generator yielding ``(key_text, data)`` for every entry, reading and
        decompressing the record blocks one at a time.

        If an LZO block is met while ``lzo`` is unavailable, a message is
        printed and iteration stops; the closing size assertion then fails.
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)
            record_block_info_list, record_block_size = self._read_record_block_info(f)

            # actual record block
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                record_block_compressed = f.read(compressed_size)
                type_id = self._block_type_id(record_block_compressed)
                if type_id == 1 and lzo is None:
                    print("LZO compression is not supported")
                    break
                record_block = self._unpack_record_block(
                    record_block_compressed, decompressed_size, type_id)

                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= len(record_block):
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = len(record_block) + offset
                    i += 1
                    yield key_text, record_block[record_start - offset:record_end - offset]
                offset += len(record_block)
                size_counter += compressed_size
            assert size_counter == record_block_size

    def get_index(self, check_block=True):
        """
        Build the index of the mdd file and return it as a list of dicts.

        Each dict contains:
          key_text          -- the key, decoded from UTF-8
          file_pos          -- file position where the record block starts
          compressed_size   -- size of the record block as stored in the file
          decompressed_size -- size of the record block after decompression
          record_block_type -- compression type of the record block (0/1/2)
          record_start, record_end, offset
                            -- values needed to cut this record out of the
                               decompressed record block

        With ``check_block`` true every block is decompressed and verified
        against its checksum and size.
        """
        index_dict_list = []
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)
            record_block_info_list, record_block_size = self._read_record_block_info(f)

            # actual record block
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                current_pos = f.tell()
                record_block_compressed = f.read(compressed_size)
                type_id = self._block_type_id(record_block_compressed)
                if type_id == 1 and lzo is None:
                    print("LZO compression is not supported")
                    break
                if check_block:
                    self._unpack_record_block(record_block_compressed, decompressed_size, type_id)
                else:
                    # still require a readable checksum field, as before
                    unpack('>I', record_block_compressed[4:8])

                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= decompressed_size:
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = decompressed_size + offset
                    i += 1
                    index_dict_list.append({
                        'file_pos': current_pos,
                        'compressed_size': compressed_size,
                        'decompressed_size': decompressed_size,
                        'record_block_type': type_id,
                        'record_start': record_start,
                        'key_text': key_text.decode("utf-8", errors='ignore'),
                        'offset': offset,
                        'record_end': record_end,
                    })
                offset += decompressed_size
                size_counter += compressed_size
            assert size_counter == record_block_size
        return index_dict_list
class MDX(MDict):
    """
    MDict dictionary file format (*.MDX) reader.
    >>> mdx = MDX('example.mdx')
    >>> len(mdx)
    42481
    >>> for key,value in mdx.items():
    ...     print(key, value[:10])
    """

    # 4-byte compression marker -> numeric type stored in the index
    _BLOCK_TYPE_IDS = {
        b'\x00\x00\x00\x00': 0,  # stored without compression
        b'\x01\x00\x00\x00': 1,  # LZO
        b'\x02\x00\x00\x00': 2,  # zlib
    }

    def __init__(self, fname, encoding='', substyle=False, passcode=None):
        MDict.__init__(self, fname, encoding, passcode)
        self._substyle = substyle

    def items(self):
        """Return a generator which in turn produce tuples in the form of (key, value)
        """
        return self._decode_record_block()

    def _substitute_stylesheet(self, txt):
        """
        Replace every `N` style marker in ``txt`` with the begin/end strings
        registered for style N in ``self._stylesheet``.

        Works on both bytes and text input; stylesheet keys and values are
        converted (UTF-8) to the type of ``txt`` when they differ.  Raises
        KeyError for a style number missing from the stylesheet.
        """
        is_bytes = isinstance(txt, bytes)
        if is_bytes:
            tag_pattern, newline, crlf = br'`\d+`', b'\n', b'\r\n'
        else:
            tag_pattern, newline, crlf = u'`\\d+`', u'\n', u'\r\n'

        def coerce(value):
            # bring a stylesheet string to the same type as txt
            if isinstance(value, bytes) == is_bytes:
                return value
            return value.encode('utf-8') if is_bytes else value.decode('utf-8')

        def lookup(number):
            # the stylesheet may be keyed by bytes or by text
            alternate = number.decode('utf-8') if is_bytes else number.encode('utf-8')
            for candidate in (number, alternate):
                if candidate in self._stylesheet:
                    return self._stylesheet[candidate]
            raise KeyError(number)

        pieces = re.split(tag_pattern, txt)
        tags = re.findall(tag_pattern, txt)
        txt_styled = pieces[0]
        for tag, piece in zip(tags, pieces[1:]):
            style = lookup(tag[1:-1])
            style_begin, style_end = coerce(style[0]), coerce(style[1])
            if piece.endswith(newline):
                txt_styled = txt_styled + style_begin + piece.rstrip() + style_end + crlf
            else:
                txt_styled = txt_styled + style_begin + piece + style_end
        return txt_styled

    def _read_record_block_info(self, f):
        """
        Read the record section header from ``f`` (positioned at its start).

        Returns ``(record_block_info_list, record_block_size)`` where the list
        holds one ``(compressed_size, decompressed_size)`` tuple per record
        block.  Assertions check the entry count and the info section size.
        """
        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert num_entries == self._num_entries
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        record_block_info_list = []
        for _ in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list.append((compressed_size, decompressed_size))
        # each info entry consists of two numbers
        assert len(record_block_info_list) * self._number_width * 2 == record_block_info_size
        return record_block_info_list, record_block_size

    def _block_type_id(self, record_block_compressed):
        """Return 0/1/2 for the block's compression marker; ValueError if unknown."""
        marker = record_block_compressed[:4]
        if marker not in self._BLOCK_TYPE_IDS:
            raise ValueError('unknown record block compression type: %r' % (marker,))
        return self._BLOCK_TYPE_IDS[marker]

    def _unpack_record_block(self, record_block_compressed, decompressed_size, type_id):
        """
        Return the decompressed payload of one record block and verify it
        against the adler32 checksum and the expected size.
        """
        # 4 bytes adler checksum of uncompressed content
        adler32 = unpack('>I', record_block_compressed[4:8])[0]
        if type_id == 0:
            record_block = record_block_compressed[8:]
        elif type_id == 1:
            record_block = lzo.decompress(record_block_compressed[8:],
                                          initSize=decompressed_size, blockSize=1308672)
        else:
            record_block = zlib.decompress(record_block_compressed[8:])
        # notice that adler32 return signed value
        assert adler32 == zlib.adler32(record_block) & 0xffffffff
        assert len(record_block) == decompressed_size
        return record_block

    def _decode_record_block(self):
        """
        Generator yielding ``(key_text, record)`` for every entry.

        Each record is decoded with the dictionary encoding, stripped of NUL
        characters at both ends and re-encoded as UTF-8; stylesheet markers
        are substituted when ``substyle`` was requested and a stylesheet is
        present.  If an LZO block is met while ``lzo`` is unavailable, a
        message is printed and iteration stops; the closing size assertion
        then fails.
        """
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)
            record_block_info_list, record_block_size = self._read_record_block_info(f)

            # actual record block data
            offset = 0
            i = 0
            size_counter = 0
            for compressed_size, decompressed_size in record_block_info_list:
                record_block_compressed = f.read(compressed_size)
                type_id = self._block_type_id(record_block_compressed)
                if type_id == 1 and lzo is None:
                    print("LZO compression is not supported")
                    break
                record_block = self._unpack_record_block(
                    record_block_compressed, decompressed_size, type_id)

                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= len(record_block):
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = len(record_block) + offset
                    i += 1
                    record = record_block[record_start - offset:record_end - offset]
                    # convert to utf-8
                    record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
                    # substitute styles
                    if self._substyle and self._stylesheet:
                        record = self._substitute_stylesheet(record)

                    yield key_text, record
                offset += len(record_block)
                size_counter += compressed_size
            assert size_counter == record_block_size

    def get_index(self, check_block=True):
        """
        Build the index of the mdx file.

        Returns ``{"index_dict_list": [...], "meta": {...}}``.  Each index
        dict contains:
          key_text          -- the key, decoded from UTF-8
          file_pos          -- file position where the record block starts
          compressed_size   -- size of the record block as stored in the file
          decompressed_size -- size of the record block after decompression
          record_block_type -- compression type of the record block (0/1/2)
          record_start, record_end, offset
                            -- values needed to cut this record out of the
                               decompressed record block
        ``meta`` carries the encoding, the JSON-serialised stylesheet, the
        title and the description.

        With ``check_block`` true every block is decompressed and verified
        against its checksum and size.  If an LZO block is met while ``lzo``
        is unavailable, a message is printed and the index built so far is
        returned.
        """
        index_dict_list = []
        with open(self._fname, 'rb') as f:
            f.seek(self._record_block_offset)
            record_block_info_list, record_block_size = self._read_record_block_info(f)

            # actual record block data
            offset = 0
            i = 0
            for compressed_size, decompressed_size in record_block_info_list:
                # the block's file position is what an index user must seek to
                current_pos = f.tell()
                record_block_compressed = f.read(compressed_size)
                type_id = self._block_type_id(record_block_compressed)
                if type_id == 1 and lzo is None:
                    print("LZO compression is not supported")
                    break
                if check_block:
                    self._unpack_record_block(record_block_compressed, decompressed_size, type_id)
                else:
                    # still require a readable checksum field, as before
                    unpack('>I', record_block_compressed[4:8])

                # split record block according to the offset info from key block
                while i < len(self._key_list):
                    record_start, key_text = self._key_list[i]
                    # reach the end of current record block
                    if record_start - offset >= decompressed_size:
                        break
                    # record end index
                    if i < len(self._key_list) - 1:
                        record_end = self._key_list[i + 1][0]
                    else:
                        record_end = decompressed_size + offset
                    i += 1
                    index_dict_list.append({
                        'file_pos': current_pos,
                        'compressed_size': compressed_size,
                        'decompressed_size': decompressed_size,
                        'record_block_type': type_id,
                        'record_start': record_start,
                        'key_text': key_text.decode('utf-8', errors='ignore'),
                        'offset': offset,
                        'record_end': record_end,
                    })
                offset += decompressed_size
            # NOTE: unlike the mdd reader, the total compressed size is
            # deliberately not asserted against record_block_size here.

        # compared with the mdd index, encoding and stylesheet information
        # are passed along as well
        meta = {}
        meta['encoding'] = self._encoding
        meta['stylesheet'] = json.dumps(self._stylesheet)
        meta['title'] = self._title
        meta['description'] = self._description
        return {"index_dict_list": index_dict_list, 'meta': meta}
if __name__ == '__main__':
    # Command line front end: print a summary of an .mdx file and its
    # companion .mdd file and, with --extract, write their contents to disk.
    import sys
    import os
    import os.path
    import argparse
    import codecs

    def passcode(s):
        """argparse type: parse 'regcode,userid' into (regcode bytes, userid)."""
        try:
            regcode, userid = s.split(',')
        except ValueError:
            raise argparse.ArgumentTypeError("Passcode must be regcode,userid")

        try:
            regcode = codecs.decode(regcode, 'hex')
        except Exception:
            raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")

        return regcode, userid

    def display_name(path):
        """Return ``path`` in a form that prints cleanly on Python 2 and 3."""
        if sys.version_info[0] < 3 and isinstance(path, unicode):
            return path.encode('utf-8')
        return path

    def print_summary(path, mdict):
        """Print file name, number of entries and all header attributes."""
        print('======== %s ========' % display_name(path))
        print(' Number of Entries : %d' % len(mdict))
        for key, value in mdict.header.items():
            print(' %s : %s' % (key, value))

    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--extract', action="store_true",
                        help='extract mdx to source format and extract files from mdd')
    parser.add_argument('-s', '--substyle', action="store_true",
                        help='substitute style definition if present')
    parser.add_argument('-d', '--datafolder', default="data",
                        help='folder to extract data files from mdd')
    parser.add_argument('-e', '--encoding', default="",
                        help='text encoding of the mdx file (overrides the file header)')
    parser.add_argument('-p', '--passcode', default=None, type=passcode,
                        help='register_code,email_or_deviceid')
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()

    # use GUI to select file, default to extract
    if not args.filename:
        try:
            # Python 3 module names
            import tkinter as Tkinter
            import tkinter.filedialog as tkFileDialog
        except ImportError:
            # Python 2 module names
            import Tkinter
            import tkFileDialog
        root = Tkinter.Tk()
        root.withdraw()
        args.filename = tkFileDialog.askopenfilename(parent=root)
        args.extract = True

    if not os.path.exists(args.filename):
        print("Please specify a valid MDX/MDD file")
        # nothing useful can be done without an input file
        sys.exit(1)

    base, ext = os.path.splitext(args.filename)

    # read mdx file
    if ext.lower() == os.path.extsep + 'mdx':
        mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
        print_summary(args.filename, mdx)
    else:
        mdx = None

    # find companion mdd file
    mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
    if os.path.exists(mdd_filename):
        mdd = MDD(mdd_filename, args.passcode)
        print_summary(mdd_filename, mdd)
    else:
        mdd = None

    if args.extract:
        # write out glos
        if mdx:
            output_fname = ''.join([base, os.path.extsep, 'txt'])
            with open(output_fname, 'wb') as tf:
                for key, value in mdx.items():
                    tf.write(key)
                    tf.write(b'\r\n')
                    tf.write(value)
                    if not value.endswith(b'\n'):
                        tf.write(b'\r\n')
                    tf.write(b'</>\r\n')
            # write out style; header keys are bytes
            if mdx.header.get(b'StyleSheet'):
                style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
                with open(style_fname, 'wb') as sf:
                    sf.write(b'\r\n'.join(mdx.header[b'StyleSheet'].splitlines()))
        # write out optional data files
        if mdd:
            datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
            if not os.path.exists(datafolder):
                os.makedirs(datafolder)
            data_root = os.path.abspath(datafolder)
            for key, value in mdd.items():
                fname = key.decode('utf-8').replace('\\', os.path.sep)
                dfname = os.path.abspath(os.path.join(datafolder, fname.lstrip(os.path.sep)))
                # resource names come from the dictionary file; never write
                # outside the chosen data folder
                if not dfname.startswith(data_root + os.path.sep):
                    print("Skipping resource outside data folder: %s" % fname)
                    continue
                if not os.path.exists(os.path.dirname(dfname)):
                    os.makedirs(os.path.dirname(dfname))
                with open(dfname, 'wb') as df:
                    df.write(value)