anki-word-query/addons21/fastwq/libs/mdict/mdict_query.py


# -*- coding: utf-8 -*-
import json
import os
import re
import sqlite3
import sys
# zlib compression is used for engine version >= 2.0
import zlib
from io import BytesIO
from struct import pack, unpack

from .readmdict import MDD, MDX

# import chardet

# LZO compression is used for engine version < 2.0
try:
    import lzo
except ImportError:
    lzo = None
    # print("LZO compression support is not available")

# 2.x/3.x compatible
if sys.hexversion >= 0x03000000:
    unicode = str

version = '1.1'


class IndexBuilder(object):
    # todo: enable history

    def __init__(self,
                 fname,
                 encoding="",
                 passcode=None,
                 force_rebuild=False,
                 enable_history=False,
                 sql_index=True,
                 check=False):
        self._mdx_file = fname
        self._mdd_file = ""
        self._encoding = ''
        self._stylesheet = {}
        self._title = ''
        self._version = ''
        self._description = ''
        self._sql_index = sql_index
        self._check = check
        _filename, _file_extension = os.path.splitext(fname)
        assert (_file_extension == '.mdx')
        assert (os.path.isfile(fname))
        self._mdx_db = _filename + ".mdx.db"
        # make index anyway
        if force_rebuild:
            self._make_mdx_index(self._mdx_db)
            if os.path.isfile(_filename + '.mdd'):
                self._mdd_file = _filename + ".mdd"
                self._mdd_db = _filename + ".mdd.db"
                self._make_mdd_index(self._mdd_db)

        if os.path.isfile(self._mdx_db):
            # read from the META table
            conn = sqlite3.connect(self._mdx_db)
            # cursor = conn.execute("SELECT * FROM META")
            cursor = conn.execute("SELECT * FROM META WHERE key = \"version\"")
            # check whether a version number is present
            for cc in cursor:
                self._version = cc[1]
            # if no version info is found, rebuild the index databases
            if not self._version:
                print("version info not found")
                conn.close()
                self._make_mdx_index(self._mdx_db)
                print("mdx.db rebuilt!")
                if os.path.isfile(_filename + '.mdd'):
                    self._mdd_file = _filename + ".mdd"
                    self._mdd_db = _filename + ".mdd.db"
                    self._make_mdd_index(self._mdd_db)
                    print("mdd.db rebuilt!")
                return None
            cursor = conn.execute(
                "SELECT * FROM META WHERE key = \"encoding\"")
            for cc in cursor:
                self._encoding = cc[1]
            cursor = conn.execute(
                "SELECT * FROM META WHERE key = \"stylesheet\"")
            for cc in cursor:
                self._stylesheet = json.loads(cc[1])
            cursor = conn.execute("SELECT * FROM META WHERE key = \"title\"")
            for cc in cursor:
                self._title = cc[1]
            cursor = conn.execute(
                "SELECT * FROM META WHERE key = \"description\"")
            for cc in cursor:
                self._description = cc[1]
            # for cc in cursor:
            #     if cc[0] == 'encoding':
            #         self._encoding = cc[1]
            #         continue
            #     if cc[0] == 'stylesheet':
            #         self._stylesheet = json.loads(cc[1])
            #         continue
            #     if cc[0] == 'title':
            #         self._title = cc[1]
            #         continue
            #     if cc[0] == 'description':
            #         self._description = cc[1]
        else:
            self._make_mdx_index(self._mdx_db)

        if os.path.isfile(_filename + ".mdd"):
            self._mdd_file = _filename + ".mdd"
            self._mdd_db = _filename + ".mdd.db"
            if not os.path.isfile(self._mdd_db):
                self._make_mdd_index(self._mdd_db)
        pass

    def _replace_stylesheet(self, txt):
        # substitute stylesheet definitions for the `N` markers in the record
        encoding = 'utf-8'
        if isinstance(txt, bytes):
            # encode_type = chardet.detect(txt)
            # encoding = encode_type['encoding']
            txt = txt.decode(encoding)
        txt_list = re.split(r'`\d+`', txt)
        txt_tag = re.findall(r'`\d+`', txt)
        txt_styled = txt_list[0]
        for j, p in enumerate(txt_list[1:]):
            style = self._stylesheet[txt_tag[j][1:-1]]
            if p and p[-1] == '\n':
                txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
            else:
                txt_styled = txt_styled + style[0] + p + style[1]
        return txt_styled.encode(encoding)

    def _make_mdx_index(self, db_name):
        if os.path.exists(db_name):
            os.remove(db_name)
        mdx = MDX(self._mdx_file)
        self._mdx_db = db_name
        returned_index = mdx.get_index(check_block=self._check)
        index_list = returned_index['index_dict_list']
        conn = sqlite3.connect(db_name)
        c = conn.cursor()
        c.execute('''CREATE TABLE MDX_INDEX
                     (key_text text not null,
                      file_pos integer,
                      compressed_size integer,
                      decompressed_size integer,
                      record_block_type integer,
                      record_start integer,
                      record_end integer,
                      offset integer
                     )''')

        tuple_list = [(item['key_text'], item['file_pos'],
                       item['compressed_size'], item['decompressed_size'],
                       item['record_block_type'], item['record_start'],
                       item['record_end'], item['offset'])
                      for item in index_list]
        c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                      tuple_list)

        # build the metadata table
        meta = returned_index['meta']
        c.execute('''CREATE TABLE META
                     (key text,
                      value text
                     )''')
        # for k, v in meta:
        #     c.execute(
        #         'INSERT INTO META VALUES (?,?)',
        #         (k, v)
        #     )
        c.executemany('INSERT INTO META VALUES (?,?)',
                      [('encoding', meta['encoding']),
                       ('stylesheet', meta['stylesheet']),
                       ('title', meta['title']),
                       ('description', meta['description']),
                       ('version', version)])

        if self._sql_index:
            c.execute('''
                CREATE INDEX key_index ON MDX_INDEX (key_text)
                ''')

        conn.commit()
        conn.close()
        # set class members from the metadata
        self._encoding = meta['encoding']
        self._stylesheet = json.loads(meta['stylesheet'])
        self._title = meta['title']
        self._description = meta['description']

    def _make_mdd_index(self, db_name):
        if os.path.exists(db_name):
            os.remove(db_name)
        mdd = MDD(self._mdd_file)
        self._mdd_db = db_name
        index_list = mdd.get_index(check_block=self._check)
        conn = sqlite3.connect(db_name)
        c = conn.cursor()
        c.execute('''CREATE TABLE MDX_INDEX
                     (key_text text not null unique,
                      file_pos integer,
                      compressed_size integer,
                      decompressed_size integer,
                      record_block_type integer,
                      record_start integer,
                      record_end integer,
                      offset integer
                     )''')

        tuple_list = [(item['key_text'], item['file_pos'],
                       item['compressed_size'], item['decompressed_size'],
                       item['record_block_type'], item['record_start'],
                       item['record_end'], item['offset'])
                      for item in index_list]
        c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
                      tuple_list)
        if self._sql_index:
            c.execute('''
                CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text)
                ''')

        conn.commit()
        conn.close()

    @staticmethod
    def get_data_by_index(fmdx, index):
        fmdx.seek(index['file_pos'])
        record_block_compressed = fmdx.read(index['compressed_size'])
        # each record block begins with a 4-byte compression-type field
        # followed by a 4-byte adler32 checksum of the decompressed data
        record_block_type = record_block_compressed[:4]
        record_block_type = index['record_block_type']
        decompressed_size = index['decompressed_size']
        # adler32 = unpack('>I', record_block_compressed[4:8])[0]
        # no compression
        if record_block_type == 0:
            _record_block = record_block_compressed[8:]
        # lzo compression
        elif record_block_type == 1:
            if lzo is None:
                print("LZO compression is not supported")
            # decompress
            header = b'\xf0' + pack('>I', index['decompressed_size'])
            _record_block = lzo.decompress(
                record_block_compressed[8:],
                initSize=decompressed_size,
                blockSize=1308672)
        # zlib compression
        elif record_block_type == 2:
            # decompress
            _record_block = zlib.decompress(record_block_compressed[8:])
        data = _record_block[index['record_start'] -
                             index['offset']:index['record_end'] -
                             index['offset']]
        return data

    def get_mdx_by_index(self, fmdx, index):
        data = self.get_data_by_index(fmdx, index)
        record = data.decode(
            self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
        if self._stylesheet:
            record = self._replace_stylesheet(record)
        record = record.decode('utf-8')
        return record

    def get_mdd_by_index(self, fmdx, index):
        return self.get_data_by_index(fmdx, index)

    @staticmethod
    def lookup_indexes(db, keyword, ignorecase=None):
        indexes = []
        if ignorecase:
            sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format(
                keyword)
        else:
            sql = 'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format(
                keyword)
        with sqlite3.connect(db) as conn:
            cursor = conn.execute(sql)
            for result in cursor:
                index = {}
                index['file_pos'] = result[1]
                index['compressed_size'] = result[2]
                index['decompressed_size'] = result[3]
                index['record_block_type'] = result[4]
                index['record_start'] = result[5]
                index['record_end'] = result[6]
                index['offset'] = result[7]
                indexes.append(index)
        return indexes

    def mdx_lookup(self, keyword, ignorecase=None):
        lookup_result_list = []
        indexes = self.lookup_indexes(self._mdx_db, keyword, ignorecase)
        with open(self._mdx_file, 'rb') as mdx_file:
            for index in indexes:
                lookup_result_list.append(
                    self.get_mdx_by_index(mdx_file, index))
        return lookup_result_list

    def mdd_lookup(self, keyword, ignorecase=None):
        lookup_result_list = []
        indexes = self.lookup_indexes(self._mdd_db, keyword, ignorecase)
        with open(self._mdd_file, 'rb') as mdd_file:
            for index in indexes:
                lookup_result_list.append(
                    self.get_mdd_by_index(mdd_file, index))
        return lookup_result_list

    @staticmethod
    def get_keys(db, query=''):
        if not db:
            return []
        if query:
            if '*' in query:
                query = query.replace('*', '%')
            else:
                query = query + '%'
            sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE \"' + query + '\"'
        else:
            sql = 'SELECT key_text FROM MDX_INDEX'
        with sqlite3.connect(db) as conn:
            cursor = conn.execute(sql)
            keys = [item[0] for item in cursor]
        return keys

    def get_mdd_keys(self, query=''):
        return self.get_keys(self._mdd_db, query)

    def get_mdx_keys(self, query=''):
        return self.get_keys(self._mdx_db, query)

# mdx_builder = IndexBuilder("oald.mdx")
# text = mdx_builder.mdx_lookup('dedication')
# keys = mdx_builder.get_mdx_keys()
# keys1 = mdx_builder.get_mdx_keys('abstrac')
# keys2 = mdx_builder.get_mdx_keys('*tion')
# for key in keys2:
#     text = mdx_builder.mdx_lookup(key)[0]
#     pass
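
# A minimal usage sketch expanding on the commented example above. It is not
# part of the original API: the helper name `_demo` and the "oald.mdx" /
# 'dedication' inputs are placeholders only, and because this module uses a
# relative import it must be imported as part of the fastwq mdict package
# rather than run as a script.
def _demo(mdx_path, word='dedication'):
    # Constructing IndexBuilder creates <dict>.mdx.db (and <dict>.mdd.db when a
    # companion .mdd file exists) next to the dictionary on first use.
    builder = IndexBuilder(mdx_path)
    # mdx_lookup returns a list of decoded definition records for the headword.
    definitions = builder.mdx_lookup(word)
    # In key queries, '*' is translated to the SQL LIKE wildcard '%'.
    matching_keys = builder.get_mdx_keys('*tion')
    return definitions, matching_keys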