bug fixes

This commit is contained in:
sthoo 2019-03-09 23:06:55 +08:00
parent 27dd65c635
commit 0e502406d1

View File

@ -10,10 +10,10 @@ import zlib
from io import BytesIO from io import BytesIO
from struct import pack, unpack from struct import pack, unpack
import chardet
from .readmdict import MDD, MDX from .readmdict import MDD, MDX
# import chardet
# LZO compression is used for engine version < 2.0 # LZO compression is used for engine version < 2.0
try: try:
import lzo import lzo
@ -30,7 +30,14 @@ version = '1.1'
class IndexBuilder(object): class IndexBuilder(object):
#todo: enable history #todo: enable history
def __init__(self, fname, encoding = "", passcode = None, force_rebuild = False, enable_history = False, sql_index = True, check = False): def __init__(self,
fname,
encoding="",
passcode=None,
force_rebuild=False,
enable_history=False,
sql_index=True,
check=False):
self._mdx_file = fname self._mdx_file = fname
self._mdd_file = "" self._mdd_file = ""
self._encoding = '' self._encoding = ''
@ -41,8 +48,8 @@ class IndexBuilder(object):
self._sql_index = sql_index self._sql_index = sql_index
self._check = check self._check = check
_filename, _file_extension = os.path.splitext(fname) _filename, _file_extension = os.path.splitext(fname)
assert(_file_extension == '.mdx') assert (_file_extension == '.mdx')
assert(os.path.isfile(fname)) assert (os.path.isfile(fname))
self._mdx_db = _filename + ".mdx.db" self._mdx_db = _filename + ".mdx.db"
# make index anyway # make index anyway
if force_rebuild: if force_rebuild:
@ -72,10 +79,12 @@ class IndexBuilder(object):
self._make_mdd_index(self._mdd_db) self._make_mdd_index(self._mdd_db)
print("mdd.db rebuilt!") print("mdd.db rebuilt!")
return None return None
cursor = conn.execute("SELECT * FROM META WHERE key = \"encoding\"") cursor = conn.execute(
"SELECT * FROM META WHERE key = \"encoding\"")
for cc in cursor: for cc in cursor:
self._encoding = cc[1] self._encoding = cc[1]
cursor = conn.execute("SELECT * FROM META WHERE key = \"stylesheet\"") cursor = conn.execute(
"SELECT * FROM META WHERE key = \"stylesheet\"")
for cc in cursor: for cc in cursor:
self._stylesheet = json.loads(cc[1]) self._stylesheet = json.loads(cc[1])
@ -83,7 +92,8 @@ class IndexBuilder(object):
for cc in cursor: for cc in cursor:
self._title = cc[1] self._title = cc[1]
cursor = conn.execute("SELECT * FROM META WHERE key = \"description\"") cursor = conn.execute(
"SELECT * FROM META WHERE key = \"description\"")
for cc in cursor: for cc in cursor:
self._description = cc[1] self._description = cc[1]
@ -108,14 +118,13 @@ class IndexBuilder(object):
if not os.path.isfile(self._mdd_db): if not os.path.isfile(self._mdd_db):
self._make_mdd_index(self._mdd_db) self._make_mdd_index(self._mdd_db)
pass pass
def _replace_stylesheet(self, txt): def _replace_stylesheet(self, txt):
# substitute stylesheet definition # substitute stylesheet definition
encoding = 'utf-8' encoding = 'utf-8'
if isinstance(txt, bytes): if isinstance(txt, bytes):
encode_type = chardet.detect(txt) # encode_type = chardet.detect(txt)
encoding = encode_type['encoding'] # encoding = encode_type['encoding']
txt = txt.decode(encoding) txt = txt.decode(encoding)
txt_list = re.split('`\d+`', txt) txt_list = re.split('`\d+`', txt)
txt_tag = re.findall('`\d+`', txt) txt_tag = re.findall('`\d+`', txt)
@ -134,12 +143,11 @@ class IndexBuilder(object):
os.remove(db_name) os.remove(db_name)
mdx = MDX(self._mdx_file) mdx = MDX(self._mdx_file)
self._mdx_db = db_name self._mdx_db = db_name
returned_index = mdx.get_index(check_block = self._check) returned_index = mdx.get_index(check_block=self._check)
index_list = returned_index['index_dict_list'] index_list = returned_index['index_dict_list']
conn = sqlite3.connect(db_name) conn = sqlite3.connect(db_name)
c = conn.cursor() c = conn.cursor()
c.execute( c.execute(''' CREATE TABLE MDX_INDEX
''' CREATE TABLE MDX_INDEX
(key_text text not null, (key_text text not null,
file_pos integer, file_pos integer,
compressed_size integer, compressed_size integer,
@ -148,53 +156,39 @@ class IndexBuilder(object):
record_start integer, record_start integer,
record_end integer, record_end integer,
offset integer offset integer
)''' )''')
)
tuple_list = [ tuple_list = [(item['key_text'], item['file_pos'],
(item['key_text'], item['compressed_size'], item['decompressed_size'],
item['file_pos'], item['record_block_type'], item['record_start'],
item['compressed_size'], item['record_end'], item['offset'])
item['decompressed_size'], for item in index_list]
item['record_block_type'],
item['record_start'],
item['record_end'],
item['offset']
)
for item in index_list
]
c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
tuple_list) tuple_list)
# build the metadata table # build the metadata table
meta = returned_index['meta'] meta = returned_index['meta']
c.execute( c.execute('''CREATE TABLE META
'''CREATE TABLE META
(key text, (key text,
value text value text
)''') )''')
#for k,v in meta: #for k,v in meta:
# c.execute( # c.execute(
# 'INSERT INTO META VALUES (?,?)', # 'INSERT INTO META VALUES (?,?)',
# (k, v) # (k, v)
# ) # )
c.executemany( c.executemany('INSERT INTO META VALUES (?,?)',
'INSERT INTO META VALUES (?,?)', [('encoding', meta['encoding']),
[('encoding', meta['encoding']), ('stylesheet', meta['stylesheet']),
('stylesheet', meta['stylesheet']), ('title', meta['title']),
('title', meta['title']), ('description', meta['description']),
('description', meta['description']), ('version', version)])
('version', version)
]
)
if self._sql_index: if self._sql_index:
c.execute( c.execute('''
'''
CREATE INDEX key_index ON MDX_INDEX (key_text) CREATE INDEX key_index ON MDX_INDEX (key_text)
''' ''')
)
conn.commit() conn.commit()
conn.close() conn.close()
@ -204,17 +198,15 @@ class IndexBuilder(object):
self._title = meta['title'] self._title = meta['title']
self._description = meta['description'] self._description = meta['description']
def _make_mdd_index(self, db_name): def _make_mdd_index(self, db_name):
if os.path.exists(db_name): if os.path.exists(db_name):
os.remove(db_name) os.remove(db_name)
mdd = MDD(self._mdd_file) mdd = MDD(self._mdd_file)
self._mdd_db = db_name self._mdd_db = db_name
index_list = mdd.get_index(check_block = self._check) index_list = mdd.get_index(check_block=self._check)
conn = sqlite3.connect(db_name) conn = sqlite3.connect(db_name)
c = conn.cursor() c = conn.cursor()
c.execute( c.execute(''' CREATE TABLE MDX_INDEX
''' CREATE TABLE MDX_INDEX
(key_text text not null unique, (key_text text not null unique,
file_pos integer, file_pos integer,
compressed_size integer, compressed_size integer,
@ -223,29 +215,19 @@ class IndexBuilder(object):
record_start integer, record_start integer,
record_end integer, record_end integer,
offset integer offset integer
)''' )''')
)
tuple_list = [ tuple_list = [(item['key_text'], item['file_pos'],
(item['key_text'], item['compressed_size'], item['decompressed_size'],
item['file_pos'], item['record_block_type'], item['record_start'],
item['compressed_size'], item['record_end'], item['offset'])
item['decompressed_size'], for item in index_list]
item['record_block_type'],
item['record_start'],
item['record_end'],
item['offset']
)
for item in index_list
]
c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
tuple_list) tuple_list)
if self._sql_index: if self._sql_index:
c.execute( c.execute('''
'''
CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text) CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text)
''' ''')
)
conn.commit() conn.commit()
conn.close() conn.close()
@ -266,32 +248,40 @@ class IndexBuilder(object):
print("LZO compression is not supported") print("LZO compression is not supported")
# decompress # decompress
header = b'\xf0' + pack('>I', index['decompressed_size']) header = b'\xf0' + pack('>I', index['decompressed_size'])
_record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672) _record_block = lzo.decompress(
# zlib compression record_block_compressed[8:],
initSize=decompressed_size,
blockSize=1308672)
# zlib compression
elif record_block_type == 2: elif record_block_type == 2:
# decompress # decompress
_record_block = zlib.decompress(record_block_compressed[8:]) _record_block = zlib.decompress(record_block_compressed[8:])
data = _record_block[index['record_start'] - index['offset']:index['record_end'] - index['offset']] data = _record_block[index['record_start'] -
index['offset']:index['record_end'] -
index['offset']]
return data return data
def get_mdx_by_index(self, fmdx, index):
    """Return the text of one MDX record as a ``str``.

    Args:
        fmdx: Open MDX file object; passed through unchanged to
            ``self.get_data_by_index``.
        index: Index entry describing the record; passed through unchanged
            to ``self.get_data_by_index``.

    Returns:
        The record text. The raw record is decoded with ``self._encoding``
        (undecodable bytes are dropped), NUL characters are stripped from
        both ends, and, when ``self._stylesheet`` is non-empty, the text is
        run through ``self._replace_stylesheet``.
    """
    data = self.get_data_by_index(fmdx, index)
    # Normalise to UTF-8 bytes first: _replace_stylesheet is handed bytes,
    # and the final decode below expects UTF-8 on both paths.
    text = data.decode(self._encoding, errors='ignore').strip(u'\x00')
    record = text.encode('utf-8')
    if self._stylesheet:
        record = self._replace_stylesheet(record)
    return record.decode('utf-8')
def get_mdd_by_index(self, fmdx, index):
    """Return the raw data of one MDD record.

    Unlike ``get_mdx_by_index`` no decoding or stylesheet substitution is
    applied; the value from ``self.get_data_by_index`` is returned as-is.

    Args:
        fmdx: Open file object to read from (named ``fmdx`` for symmetry
            with the MDX variant; ``mdd_lookup`` passes the MDD file).
        index: Index entry describing the record.
    """
    return self.get_data_by_index(fmdx, index)
@staticmethod @staticmethod
def lookup_indexes(db,keyword,ignorecase=None): def lookup_indexes(db, keyword, ignorecase=None):
indexes = [] indexes = []
if ignorecase: if ignorecase:
sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format(keyword) sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format(
keyword)
else: else:
sql = 'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format(keyword) sql = 'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format(
keyword)
with sqlite3.connect(db) as conn: with sqlite3.connect(db) as conn:
cursor = conn.execute(sql) cursor = conn.execute(sql)
for result in cursor: for result in cursor:
@ -306,29 +296,31 @@ class IndexBuilder(object):
indexes.append(index) indexes.append(index)
return indexes return indexes
def mdx_lookup(self, keyword, ignorecase=None):
    """Look up ``keyword`` in the MDX dictionary.

    Args:
        keyword: Headword to search for in the MDX index database.
        ignorecase: Forwarded to ``lookup_indexes``; when truthy the key
            comparison there is done on lower-cased text.

    Returns:
        A list with one decoded record (``str``) per matching index entry,
        in the order ``lookup_indexes`` returned them. Empty when nothing
        matches.
    """
    indexes = self.lookup_indexes(self._mdx_db, keyword, ignorecase)
    # The file is opened even when there are no matches, so a missing MDX
    # file is reported the same way regardless of the keyword.
    with open(self._mdx_file, 'rb') as mdx_file:
        return [self.get_mdx_by_index(mdx_file, index) for index in indexes]
def mdd_lookup(self, keyword, ignorecase=None):
    """Look up ``keyword`` in the MDD resource file.

    Args:
        keyword: Resource key to search for in the MDD index database.
        ignorecase: Forwarded to ``lookup_indexes``; when truthy the key
            comparison there is done on lower-cased text.

    Returns:
        A list with one raw record per matching index entry, in the order
        ``lookup_indexes`` returned them. Empty when nothing matches.
    """
    indexes = self.lookup_indexes(self._mdd_db, keyword, ignorecase)
    # The file is opened even when there are no matches, so a missing MDD
    # file is reported the same way regardless of the keyword.
    with open(self._mdd_file, 'rb') as mdd_file:
        return [self.get_mdd_by_index(mdd_file, index) for index in indexes]
@staticmethod @staticmethod
def get_keys(db,query = ''): def get_keys(db, query=''):
if not db: if not db:
return [] return []
if query: if query:
if '*' in query: if '*' in query:
query = query.replace('*','%') query = query.replace('*', '%')
else: else:
query = query + '%' query = query + '%'
sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE \"' + query + '\"' sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE \"' + query + '\"'
@ -339,12 +331,11 @@ class IndexBuilder(object):
keys = [item[0] for item in cursor] keys = [item[0] for item in cursor]
return keys return keys
def get_mdd_keys(self, query=''):
    """Return keys from the MDD index database.

    Thin wrapper that calls ``self.get_keys`` with ``self._mdd_db``.

    Args:
        query: Optional filter forwarded to ``get_keys``; there ``*`` is
            translated to the SQL ``%`` wildcard, and a query without ``*``
            is treated as a prefix.
    """
    return self.get_keys(self._mdd_db, query)
def get_mdx_keys(self, query=''):
    """Return keys from the MDX index database.

    Thin wrapper that calls ``self.get_keys`` with ``self._mdx_db``.

    Args:
        query: Optional filter forwarded to ``get_keys``; there ``*`` is
            translated to the SQL ``%`` wildcard, and a query without ``*``
            is treated as a prefix.
    """
    return self.get_keys(self._mdx_db, query)
# mdx_builder = IndexBuilder("oald.mdx") # mdx_builder = IndexBuilder("oald.mdx")
@ -353,5 +344,5 @@ class IndexBuilder(object):
# keys1 = mdx_builder.get_mdx_keys('abstrac') # keys1 = mdx_builder.get_mdx_keys('abstrac')
# keys2 = mdx_builder.get_mdx_keys('*tion') # keys2 = mdx_builder.get_mdx_keys('*tion')
# for key in keys2: # for key in keys2:
# text = mdx_builder.mdx_lookup(key)[0] # text = mdx_builder.mdx_lookup(key)[0]
# pass # pass