658 lines
20 KiB
Python
658 lines
20 KiB
Python
#-*- coding:utf-8 -*-
|
||
import gzip
|
||
import hashlib
|
||
import os
|
||
import re
|
||
import warnings
|
||
from struct import unpack
|
||
|
||
|
||
class _StarDictIfo(object):
|
||
"""
|
||
The .ifo file has the following format:
|
||
|
||
StarDict's dict ifo file
|
||
version=2.4.2
|
||
[options]
|
||
|
||
Note that the current "version" string must be "2.4.2" or "3.0.0". If it's not,
|
||
then StarDict will refuse to read the file.
|
||
If version is "3.0.0", StarDict will parse the "idxoffsetbits" option.
|
||
|
||
[options]
|
||
---------
|
||
In the example above, [options] expands to any of the following lines
|
||
specifying information about the dictionary. Each option is a keyword
|
||
followed by an equal sign, then the value of that option, then a
|
||
newline. The options may be appear in any order.
|
||
|
||
Note that the dictionary must have at least a bookname, a wordcount and a
|
||
idxfilesize, or the load will fail. All other information is optional. All
|
||
strings should be encoded in UTF-8.
|
||
|
||
Available options:
|
||
|
||
bookname= // required
|
||
wordcount= // required
|
||
synwordcount= // required if ".syn" file exists.
|
||
idxfilesize= // required
|
||
idxoffsetbits= // New in 3.0.0
|
||
author=
|
||
email=
|
||
website=
|
||
description= // You can use <br> for new line.
|
||
date=
|
||
sametypesequence= // very important.
|
||
"""
|
||
|
||
def __init__(self, dict_prefix, container):
|
||
|
||
ifo_filename = '%s.ifo' % dict_prefix
|
||
|
||
try:
|
||
_file = open(ifo_filename)
|
||
except Exception as e:
|
||
raise Exception('ifo file opening error: "{}"'.format(e))
|
||
|
||
_file.readline()
|
||
|
||
# skipping ifo header
|
||
_line = _file.readline().split('=')
|
||
if _line[0] == 'version':
|
||
self.version = _line[1]
|
||
else:
|
||
raise Exception('ifo has invalid format')
|
||
|
||
_config = {}
|
||
for _line in _file:
|
||
_line_splited = _line.split('=')
|
||
_config[_line_splited[0]] = _line_splited[1]
|
||
_file.close()
|
||
|
||
self.bookname = _config.get('bookname', None).strip()
|
||
if self.bookname is None:
|
||
raise Exception('ifo has no bookname')
|
||
|
||
self.wordcount = _config.get('wordcount', None)
|
||
if self.wordcount is None:
|
||
raise Exception('ifo has no wordcount')
|
||
self.wordcount = int(self.wordcount)
|
||
|
||
if self.version == '3.0.0':
|
||
try:
|
||
#_syn = open('%s.syn' % dict_prefix) # not used
|
||
self.synwordcount = _config.get('synwordcount', None)
|
||
if self.synwordcount is None:
|
||
raise Exception(
|
||
'ifo has no synwordcount but .syn file exists')
|
||
self.synwordcount = int(self.synwordcount)
|
||
except IOError:
|
||
pass
|
||
|
||
self.idxfilesize = _config.get('idxfilesize', None)
|
||
if self.idxfilesize is None:
|
||
raise Exception('ifo has no idxfilesize')
|
||
self.idxfilesize = int(self.idxfilesize)
|
||
|
||
self.idxoffsetbits = _config.get('idxoffsetbits', 32)
|
||
self.idxoffsetbits = int(self.idxoffsetbits)
|
||
|
||
self.author = _config.get('author', '').strip()
|
||
|
||
self.email = _config.get('email', '').strip()
|
||
|
||
self.website = _config.get('website', '').strip()
|
||
|
||
self.description = _config.get('description', '').strip()
|
||
|
||
self.date = _config.get('date', '').strip()
|
||
|
||
self.sametypesequence = _config.get('sametypesequence', '').strip()
|
||
|
||
|
||
class _StarDictIdx(object):
|
||
"""
|
||
The .idx file is just a word list.
|
||
|
||
The word list is a sorted list of word entries.
|
||
|
||
Each entry in the word list contains three fields, one after the other:
|
||
word_str; // a utf-8 string terminated by '\0'.
|
||
word_data_offset; // word data's offset in .dict file
|
||
word_data_size; // word data's total size in .dict file
|
||
"""
|
||
|
||
def __init__(self, dict_prefix, container):
|
||
self._container = container
|
||
|
||
idx_filename = '%s.idx' % dict_prefix
|
||
idx_filename_gz = '%s.gz' % idx_filename
|
||
|
||
try:
|
||
file = open_file(idx_filename, idx_filename_gz)
|
||
except Exception as e:
|
||
raise Exception('idx file opening error: "{}"'.format(e))
|
||
|
||
self._file = file.read()
|
||
|
||
""" check file size """
|
||
if file.tell() != container.ifo.idxfilesize:
|
||
raise Exception('size of the .idx file is incorrect')
|
||
file.close()
|
||
|
||
""" prepare main dict and parsing parameters """
|
||
self._idx = {}
|
||
idx_offset_bytes_size = int(container.ifo.idxoffsetbits / 8)
|
||
idx_offset_format = {4: 'L', 8: 'Q', }[idx_offset_bytes_size]
|
||
idx_cords_bytes_size = idx_offset_bytes_size + 4
|
||
|
||
""" parse data via regex """
|
||
record_pattern = br'([\d\D]+?\x00[\d\D]{' + str(
|
||
idx_cords_bytes_size).encode('utf-8') + br'})'
|
||
matched_records = re.findall(record_pattern, self._file)
|
||
|
||
""" check records count """
|
||
if len(matched_records) != container.ifo.wordcount:
|
||
raise Exception('words count is incorrect')
|
||
|
||
""" unpack parsed records """
|
||
for matched_record in matched_records:
|
||
c = matched_record.find(b'\x00')
|
||
if c == 0:
|
||
continue
|
||
record_tuple = unpack(
|
||
'!%sc%sL' % (c + 1, idx_offset_format), matched_record)
|
||
word, cords = record_tuple[:c], record_tuple[c + 1:]
|
||
self._idx[b''.join(word)] = cords
|
||
|
||
def __getitem__(self, word):
|
||
"""
|
||
returns tuple (word_data_offset, word_data_size,) for word in .dict
|
||
|
||
@note: here may be placed flexible search realization
|
||
"""
|
||
return self._idx[word.encode('utf-8')]
|
||
|
||
def __contains__(self, k):
|
||
"""
|
||
returns True if index has a word k, else False
|
||
"""
|
||
return k.encode('utf-8') in self._idx
|
||
|
||
def __eq__(self, y):
|
||
"""
|
||
returns True if hashlib.md5(x.idx) is equal to hashlib.md5(y.idx), else False
|
||
"""
|
||
return hashlib.md5(self._file).hexdigest() == hashlib.md5(y._file).hexdigest()
|
||
|
||
def __ne__(self, y):
|
||
"""
|
||
returns True if hashlib.md5(x.idx) is not equal to hashlib.md5(y.idx), else False
|
||
"""
|
||
return not self.__eq__(y)
|
||
|
||
def iterkeys(self):
|
||
"""
|
||
returns iterkeys
|
||
"""
|
||
if not self._container.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
for key in self._idx.iterkeys():
|
||
yield key.decode('utf-8')
|
||
|
||
def keys(self):
|
||
"""
|
||
returns keys
|
||
"""
|
||
if sys.version_info[0] == 3:
|
||
return self.iterkeys()
|
||
|
||
if not self._container.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
return [key.decode('utf-8') for key in self._idx.keys()]
|
||
|
||
|
||
class _StarDictDict(object):
|
||
"""
|
||
The .dict file is a pure data sequence, as the offset and size of each
|
||
word is recorded in the corresponding .idx file.
|
||
|
||
If the "sametypesequence" option is not used in the .ifo file, then
|
||
the .dict file has fields in the following order:
|
||
==============
|
||
word_1_data_1_type; // a single char identifying the data type
|
||
word_1_data_1_data; // the data
|
||
word_1_data_2_type;
|
||
word_1_data_2_data;
|
||
...... // the number of data entries for each word is determined by
|
||
// word_data_size in .idx file
|
||
word_2_data_1_type;
|
||
word_2_data_1_data;
|
||
......
|
||
==============
|
||
It's important to note that each field in each word indicates its
|
||
own length, as described below. The number of possible fields per
|
||
word is also not fixed, and is determined by simply reading data until
|
||
you've read word_data_size bytes for that word.
|
||
|
||
|
||
Suppose the "sametypesequence" option is used in the .idx file, and
|
||
the option is set like this:
|
||
sametypesequence=tm
|
||
Then the .dict file will look like this:
|
||
==============
|
||
word_1_data_1_data
|
||
word_1_data_2_data
|
||
word_2_data_1_data
|
||
word_2_data_2_data
|
||
......
|
||
==============
|
||
The first data entry for each word will have a terminating '\0', but
|
||
the second entry will not have a terminating '\0'. The omissions of
|
||
the type chars and of the last field's size information are the
|
||
optimizations required by the "sametypesequence" option described
|
||
above.
|
||
|
||
If "idxoffsetbits=64", the file size of the .dict file will be bigger
|
||
than 4G. Because we often need to mmap this large file, and there is
|
||
a 4G maximum virtual memory space limit in a process on the 32 bits
|
||
computer, which will make we can get error, so "idxoffsetbits=64"
|
||
dictionary can't be loaded in 32 bits machine in fact, StarDict will
|
||
simply print a warning in this case when loading. 64-bits computers
|
||
should haven't this limit.
|
||
|
||
Type identifiers
|
||
----------------
|
||
Here are the single-character type identifiers that may be used with
|
||
the "sametypesequence" option in the .idx file, or may appear in the
|
||
dict file itself if the "sametypesequence" option is not used.
|
||
|
||
Lower-case characters signify that a field's size is determined by a
|
||
terminating '\0', while upper-case characters indicate that the data
|
||
begins with a network byte-ordered guint32 that gives the length of
|
||
the following data's size(NOT the whole size which is 4 bytes bigger).
|
||
|
||
'm'
|
||
Word's pure text meaning.
|
||
The data should be a utf-8 string ending with '\0'.
|
||
|
||
'l'
|
||
Word's pure text meaning.
|
||
The data is NOT a utf-8 string, but is instead a string in locale
|
||
encoding, ending with '\0'. Sometimes using this type will save disk
|
||
space, but its use is discouraged.
|
||
|
||
'g'
|
||
A utf-8 string which is marked up with the Pango text markup language.
|
||
For more information about this markup language, See the "Pango
|
||
Reference Manual."
|
||
You might have it installed locally at:
|
||
file:///usr/share/gtk-doc/html/pango/PangoMarkupFormat.html
|
||
|
||
't'
|
||
English phonetic string.
|
||
The data should be a utf-8 string ending with '\0'.
|
||
|
||
Here are some utf-8 phonetic characters:
|
||
θʃŋʧðʒæıʌʊɒɛəɑɜɔˌˈːˑṃṇḷ
|
||
æɑɒʌәєŋvθðʃʒɚːɡˏˊˋ
|
||
|
||
'x'
|
||
A utf-8 string which is marked up with the xdxf language.
|
||
See http://xdxf.sourceforge.net
|
||
StarDict have these extention:
|
||
<rref> can have "type" attribute, it can be "image", "sound", "video"
|
||
and "attach".
|
||
<kref> can have "k" attribute.
|
||
|
||
'y'
|
||
Chinese YinBiao or Japanese KANA.
|
||
The data should be a utf-8 string ending with '\0'.
|
||
|
||
'k'
|
||
KingSoft PowerWord's data. The data is a utf-8 string ending with '\0'.
|
||
It is in XML format.
|
||
|
||
'w'
|
||
MediaWiki markup language.
|
||
See http://meta.wikimedia.org/wiki/Help:Editing#The_wiki_markup
|
||
|
||
'h'
|
||
Html codes.
|
||
|
||
'r'
|
||
Resource file list.
|
||
The content can be:
|
||
img:pic/example.jpg // Image file
|
||
snd:apple.wav // Sound file
|
||
vdo:film.avi // Video file
|
||
att:file.bin // Attachment file
|
||
More than one line is supported as a list of available files.
|
||
StarDict will find the files in the Resource Storage.
|
||
The image will be shown, the sound file will have a play button.
|
||
You can "save as" the attachment file and so on.
|
||
|
||
'W'
|
||
wav file.
|
||
The data begins with a network byte-ordered guint32 to identify the wav
|
||
file's size, immediately followed by the file's content.
|
||
|
||
'P'
|
||
Picture file.
|
||
The data begins with a network byte-ordered guint32 to identify the picture
|
||
file's size, immediately followed by the file's content.
|
||
|
||
'X'
|
||
this type identifier is reserved for experimental extensions.
|
||
|
||
"""
|
||
|
||
def __init__(self, dict_prefix, container, in_memory=False):
|
||
"""
|
||
opens regular or dziped .dict file
|
||
|
||
'in_memory': indicate whether read whole dict file into memory
|
||
"""
|
||
self._container = container
|
||
self._in_memory = in_memory
|
||
|
||
dict_filename = '%s.dict' % dict_prefix
|
||
dict_filename_dz = '%s.dz' % dict_filename
|
||
|
||
try:
|
||
f = open_file(dict_filename, dict_filename_dz)
|
||
except Exception as e:
|
||
raise Exception('dict file opening error: "{}"'.format(e))
|
||
|
||
if in_memory:
|
||
self._file = f.read()
|
||
f.close()
|
||
else:
|
||
self._file = f
|
||
|
||
def __getitem__(self, word):
|
||
"""
|
||
returns data from .dict for word
|
||
"""
|
||
|
||
# getting word data coordinates
|
||
cords = self._container.idx[word]
|
||
|
||
if self._in_memory:
|
||
bytes_ = self._file[cords[0]: cords[0] + cords[1]]
|
||
else:
|
||
# seeking in file for data
|
||
self._file.seek(cords[0])
|
||
|
||
# reading data
|
||
bytes_ = self._file.read(cords[1])
|
||
|
||
return bytes_.decode('utf-8')
|
||
|
||
|
||
class _StarDictSyn(object):
|
||
|
||
def __init__(self, dict_prefix, container):
|
||
|
||
syn_filename = '%s.syn' % dict_prefix
|
||
|
||
try:
|
||
self._file = open(syn_filename)
|
||
except IOError:
|
||
# syn file is optional, passing silently
|
||
pass
|
||
|
||
|
||
class Dictionary(dict):
|
||
"""
|
||
Dictionary-like class for lazy manipulating stardict dictionaries
|
||
|
||
All items of this dictionary are writable and dict is expandable itself,
|
||
but changes are not stored anywhere and available in runtime only.
|
||
|
||
We assume in this documentation that "x" or "y" is instances of the
|
||
StarDictDict class and "x.{ifo,idx{,.gz},dict{,.dz},syn}" or
|
||
"y.{ifo,idx{,.gz},dict{,.dz},syn}" is files of the corresponding stardict
|
||
dictionaries.
|
||
|
||
|
||
Following documentation is from the "dict" class an is subkect to rewrite
|
||
in further impleneted methods:
|
||
|
||
"""
|
||
|
||
def __init__(self, filename_prefix, in_memory=False):
|
||
"""
|
||
filename_prefix: path to dictionary files without files extensions
|
||
|
||
initializes new StarDictDict instance from stardict dictionary files
|
||
provided by filename_prefix
|
||
"""
|
||
|
||
self.in_memory = in_memory
|
||
|
||
# reading somedict.ifo
|
||
self.ifo = _StarDictIfo(dict_prefix=filename_prefix, container=self)
|
||
|
||
# reading somedict.idx or somedict.idx.gz
|
||
self.idx = _StarDictIdx(dict_prefix=filename_prefix, container=self)
|
||
|
||
# reading somedict.dict or somedict.dict.dz
|
||
self.dict = _StarDictDict(
|
||
dict_prefix=filename_prefix, container=self, in_memory=in_memory)
|
||
|
||
# reading somedict.syn (optional)
|
||
self.syn = _StarDictSyn(dict_prefix=filename_prefix, container=self)
|
||
|
||
# initializing cache
|
||
self._dict_cache = {}
|
||
|
||
def __cmp__(self, y):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __contains__(self, k):
|
||
"""
|
||
returns True if x.idx has a word k, else False
|
||
"""
|
||
return k in self.idx
|
||
|
||
def __delitem__(self, k):
|
||
"""
|
||
frees cache from word k translation
|
||
"""
|
||
del self._dict_cache[k]
|
||
|
||
def __eq__(self, y):
|
||
"""
|
||
returns True if hashlib.md5(x.idx) is equal to hashlib.md5(y.idx), else False
|
||
"""
|
||
return self.idx.__eq__(y.idx)
|
||
|
||
def __ge__(self, y):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __getitem__(self, k):
|
||
"""
|
||
returns translation for word k from cache or not and then caches
|
||
"""
|
||
if k in self._dict_cache:
|
||
return self._dict_cache[k]
|
||
else:
|
||
value = self.dict[k]
|
||
self._dict_cache[k] = value
|
||
return value
|
||
|
||
def __gt__(self, y):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __iter__(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __le__(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __len__(self):
|
||
"""
|
||
returns number of words provided by wordcount parameter of the x.ifo
|
||
"""
|
||
return self.ifo.wordcount
|
||
|
||
def __lt__(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def __ne__(self, y):
|
||
"""
|
||
returns True if hashlib.md5(x.idx) is not equal to hashlib.md5(y.idx), else False
|
||
"""
|
||
return not self.__eq__(y)
|
||
|
||
def __repr__(self):
|
||
"""
|
||
returns classname and bookname parameter of the x.ifo
|
||
"""
|
||
return u'%s %s' % (self.__class__, self.ifo.bookname)
|
||
|
||
def __setitem__(self, k, v):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def clear(self):
|
||
"""
|
||
clear dict cache
|
||
"""
|
||
self._dict_cache = dict()
|
||
|
||
def get(self, k, d=''):
|
||
"""
|
||
returns translation of the word k from self.dict or d if k not in x.idx
|
||
|
||
d defaults to empty string
|
||
"""
|
||
return k in self and self[k] or d
|
||
|
||
def has_key(self, k):
|
||
"""
|
||
returns True if self.idx has a word k, else False
|
||
"""
|
||
return k in self
|
||
|
||
def items(self):
|
||
"""
|
||
returns items
|
||
"""
|
||
if not self.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
return [(key, self[key]) for key in self.keys()]
|
||
|
||
def iteritems(self):
|
||
"""
|
||
returns iteritems
|
||
"""
|
||
if not self.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
for key in self.iterkeys():
|
||
yield (key, self[key])
|
||
|
||
def iterkeys(self):
|
||
"""
|
||
returns iterkeys
|
||
"""
|
||
if not self.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
return self.idx.iterkeys()
|
||
|
||
def itervalues(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def keys(self):
|
||
"""
|
||
returns keys
|
||
"""
|
||
if not self.in_memory:
|
||
warnings.warn(
|
||
'Iter dict items with in_memory=False may cause serious performance problem')
|
||
return self.idx.keys()
|
||
|
||
def pop(self, k, d):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def popitem(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def setdefault(self, k, d):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def update(self, E, **F):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def values(self):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
def fromkeys(self, S, v=None):
|
||
"""
|
||
raises NotImplemented exception
|
||
"""
|
||
raise NotImplementedError()
|
||
|
||
|
||
def open_file(regular, gz):
|
||
"""
|
||
Open regular file if it exists, gz file otherwise.
|
||
If no file exists, raise ValueError.
|
||
"""
|
||
if os.path.exists(regular):
|
||
try:
|
||
return open(regular, 'rb')
|
||
except Exception as e:
|
||
raise Exception('regular file opening error: "{}"'.format(e))
|
||
|
||
if os.path.exists(gz):
|
||
try:
|
||
return gzip.open(gz, 'rb')
|
||
except Exception as e:
|
||
raise Exception('gz file opening error: "{}"'.format(e))
|
||
|
||
raise ValueError('Neither regular nor gz file exists') |