anki-word-query/src/fastwq/service/base.py

730 lines
24 KiB
Python
Raw Normal View History

2018-07-01 10:55:30 +08:00
#-*- coding:utf-8 -*-
#
2018-07-07 17:48:15 +08:00
# Copyright © 2016-2017 sthoo <sth201807@gmail.com>
2018-07-01 10:55:30 +08:00
#
# Support: Report an issue at https://github.com/sth2018/FastWordQuery/issues
2018-07-01 10:55:30 +08:00
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version; http://www.gnu.org/copyleft/gpl.html.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import inspect
import os
2018-07-13 22:08:35 +08:00
import sys
import types
2018-07-01 10:55:30 +08:00
# use ntpath module to ensure the windows-style (e.g. '\\LDOCE.css')
# path can be processed on Unix platform.
# However, the Anki build on macOS may not include this package.
# import ntpath
import re
import shutil
import sqlite3
import urllib
import urllib2
import zlib
import random
2018-07-01 10:55:30 +08:00
from collections import defaultdict
from functools import wraps
2018-07-09 14:12:54 +08:00
from hashlib import md5
2018-07-01 10:55:30 +08:00
import cookielib
from aqt import mw
from aqt.qt import QThread, QMutex
2018-07-01 10:55:30 +08:00
from ..context import config
from ..libs import MdxBuilder, StardictBuilder
from ..utils import MapDict, wrap_css
from ..libs.bs4 import BeautifulSoup
from ..lang import _cl
try:
import threading as _threading
except ImportError:
import dummy_threading as _threading
2018-07-01 10:55:30 +08:00
2018-07-13 22:08:35 +08:00
__all__ = [
'register', 'export', 'copy_static_file', 'with_styles', 'parse_html', 'service_wrap',
'Service', 'WebService', 'LocalService', 'MdxService', 'StardictService', 'QueryResult'
]
def register(labels):
    """
    Class decorator that registers a dict service under *labels*.

    The (possibly localized) label is stored on the class as
    ``__register_label__`` and shown in the dictionary list.  Methods
    marked by ``export`` still carry the placeholder field index ``-1``;
    they are re-numbered here according to their declaration order
    (``__def_index__``).
    """
    def _deco(cls):
        cls.__register_label__ = _cl(labels)
        # collect exported methods that have not been indexed yet
        pending = []
        for _name, func in inspect.getmembers(cls, predicate=inspect.ismethod):
            attrs = getattr(func, '__export_attrs__', None)
            if attrs and attrs[1] == -1:
                pending.append((getattr(func, '__def_index__', 0), func))
        # assign final field indexes in declaration order
        for new_index, (_order, func) in enumerate(sorted(pending)):
            func.__export_attrs__[1] = new_index
        return cls
    return _deco
2018-07-13 22:08:35 +08:00
def export(labels):
    """
    Decorator that marks a service method as an exportable dict field.

    *labels* is the (possibly localized) name shown in the field list.
    The wrapped method's return value is normalized to a ``QueryResult``.
    The field index starts out as ``-1`` and is assigned for real by
    ``register``.
    """
    def _with(fld_func):
        @wraps(fld_func)
        def _deco(self, *args, **kwargs):
            res = fld_func(self, *args, **kwargs)
            if isinstance(res, QueryResult):
                return res
            return QueryResult(result=res)
        _deco.__export_attrs__ = [_cl(labels), -1]
        _deco.__def_index__ = export.EXPORT_INDEX
        export.EXPORT_INDEX += 1
        return _deco
    return _with

# running counter remembering the declaration order of exported fields
export.EXPORT_INDEX = 0
2018-07-01 10:55:30 +08:00
def copy_static_file(filename, new_filename=None, static_dir='static'):
    """
    Copy a file from this package's static directory into the current
    working (media) folder.  ``new_filename`` optionally renames the copy.
    """
    src = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        static_dir,
        filename,
    )
    shutil.copy(src, new_filename or filename)
def with_styles(**styles):
    """
    Decorator that attaches CSS/JS resources to a field method's result.

    Keyword options:
        cssfile: name of a css file in the static folder (copied to media)
        css: inline css string
        jsfile: name of a js file in the static folder
        js: inline js string
        need_wrap_css: if True, scope the css selectors under wrap_class
        wrap_class: the class name used for scoping the css/html
    """
    def _with(fld_func):
        @wraps(fld_func)
        def _deco(cls, *args, **kwargs):
            res = fld_func(cls, *args, **kwargs)
            cssfile, css, jsfile, js, need_wrap_css, class_wrapper =\
                styles.get('cssfile', None),\
                styles.get('css', None),\
                styles.get('jsfile', None),\
                styles.get('js', None),\
                styles.get('need_wrap_css', False),\
                styles.get('wrap_class', '')

            def wrap(html, css_obj, is_file=True):
                # wrap css and html: scope both under the wrapper class so
                # dictionary styles do not leak into the rest of the card
                if need_wrap_css and class_wrapper:
                    html = u'<div class="{}">{}</div>'.format(
                        class_wrapper, html)
                    # wrap_css returns (new_css, class_name); only the css is needed
                    return html, wrap_css(css_obj, is_file=is_file, class_wrapper=class_wrapper)[0]
                return html, css_obj

            if cssfile:
                # media files must start with '_' so Anki keeps them
                new_cssfile = cssfile if cssfile.startswith('_') \
                    else u'_' + cssfile
                # copy the css file to media folder
                copy_static_file(cssfile, new_cssfile)
                # wrap the css file
                res, new_cssfile = wrap(res, new_cssfile)
                res = u'<link type="text/css" rel="stylesheet" href="{0}" />{1}'.format(
                    new_cssfile, res)
            if css:
                res, css = wrap(res, css, is_file=False)
                res = u'<style>{0}</style>{1}'.format(css, res)

            # normalize to QueryResult, carrying the js resources along
            if not isinstance(res, QueryResult):
                return QueryResult(result=res, jsfile=jsfile, js=js)
            else:
                res.set_styles(jsfile=jsfile, js=js)
                return res
        return _deco
    return _with
# bs4 threading lock, overload protection: at most len(_BS_LOCKS)
# BeautifulSoup parses may run concurrently
_BS_LOCKS = [_threading.Lock(), _threading.Lock()]

def parse_html(html):
    '''
    Parse *html* with bs4, allowing only len(_BS_LOCKS) concurrent parses.

    BUG FIX: the original used randrange(0, len(_BS_LOCKS) - 1, 1), which
    with two locks always returns 0 — every parse contended on the same
    lock, defeating the two-slot design.  randrange(len(_BS_LOCKS)) picks
    uniformly from all locks.
    '''
    lock = _BS_LOCKS[random.randrange(len(_BS_LOCKS))]
    lock.acquire()
    try:
        # release the lock even if BeautifulSoup raises, otherwise the
        # slot would be lost forever
        return BeautifulSoup(html, 'html.parser')
    finally:
        lock.release()
2018-07-01 10:55:30 +08:00
def service_wrap(service, *args, **kwargs):
    """
    Return a zero-argument factory that instantiates *service* with the
    given constructor arguments (deferred construction for workers).
    """
    def _factory():
        return service(*args, **kwargs)
    return _factory
2018-07-01 10:55:30 +08:00
class Service(object):
    '''
    Dictionary Service Abstract Class

    Subclasses mark field methods with @export; `active` dispatches a
    query for one field ordinal.  Results are cached per word.
    '''

    def __init__(self):
        # per-word cache: {word: {field_key: value}}
        self.cache = defaultdict(defaultdict)
        self._exporters = self._get_exporters()
        self._fields, self._actions = zip(*self._exporters) \
            if self._exporters else (None, None)
        # query interval: default 500ms
        self.query_interval = 0.5

    def cache_this(self, result):
        """Merge *result* (a dict) into the cache for the current word."""
        self.cache[self.word].update(result)
        return result

    def cached(self, key):
        """True if *key* is already cached for the current word."""
        # FIX: 'in' instead of dict.has_key (deprecated in Python 2,
        # removed in Python 3); behavior is identical
        return (self.word in self.cache) and (key in self.cache[self.word])

    def cache_result(self, key):
        """Cached value for *key* of the current word, u'' if missing."""
        return self.cache[self.word].get(key, u'')

    @property
    def support(self):
        # subclasses override to report availability
        return True

    @property
    def fields(self):
        return self._fields

    @property
    def actions(self):
        return self._actions

    @property
    def exporters(self):
        return self._exporters

    def _get_exporters(self):
        """Collect (label, method) pairs marked by @export, ordered by index."""
        flds = dict()
        methods = inspect.getmembers(self, predicate=inspect.ismethod)
        for method in methods:
            export_attrs = getattr(method[1], '__export_attrs__', None)
            if export_attrs:
                label, index = export_attrs[0], export_attrs[1]
                flds.update({int(index): (label, method[1])})
        sorted_flds = sorted(flds)
        return [flds[key] for key in sorted_flds]

    def active(self, fld_ord, word):
        """Query field *fld_ord* for *word*; default result if out of range."""
        self.word = word
        if fld_ord >= 0 and fld_ord < len(self.actions):
            return self.actions[fld_ord]()
        return QueryResult.default()

    @staticmethod
    def get_anki_label(filename, type_):
        """Wrap *filename* in the Anki markup for the media *type_*."""
        formats = {'audio': u'[sound:{0}]',
                   'img': u'<img src="{0}">',
                   'video': u'<video controls="controls" width="100%" height="auto" src="{0}"></video>'}
        return formats[type_].format(filename)
class WebService(Service):
    """
    Web Dictionary Service

    Provides a cookie-aware urllib2 opener plus helpers for fetching
    pages and downloading media.
    """

    def __init__(self):
        super(WebService, self).__init__()
        # cookie jar shared by all requests of this service instance
        self._cookie = cookielib.CookieJar()
        self._opener = urllib2.build_opener(
            urllib2.HTTPCookieProcessor(self._cookie))
        self.query_interval = 1.0

    @property
    def title(self):
        return getattr(self, '__register_label__', self.unique)

    @property
    def unique(self):
        return self.__class__.__name__

    def get_response(self, url, data=None, headers=None, timeout=10):
        """GET/POST *url*, transparently un-gzipping; '' on any failure."""
        default_headers = {'User-Agent': 'Anki WordQuery',
                           'Accept-Encoding': 'gzip'}
        if headers:
            default_headers.update(headers)
        request = urllib2.Request(url, headers=default_headers)
        try:
            response = self._opener.open(request, data=data, timeout=timeout)
            data = response.read()
            if response.info().get('Content-Encoding') == 'gzip':
                data = zlib.decompress(data, 16 + zlib.MAX_WBITS)
            return data
        # FIX: bare 'except:' also swallowed SystemExit/KeyboardInterrupt;
        # keep the deliberate best-effort behavior but only for real errors
        except Exception:
            return ''

    @classmethod
    def download(cls, url, filename, timeout=15):
        """Download *url* to *filename*; None on failure (best effort)."""
        import socket
        socket.setdefaulttimeout(timeout)
        try:
            return urllib.urlretrieve(url, filename)
        except Exception:
            pass

    class TinyDownloadError(ValueError):
        """Raises when a download is too small."""

    def net_stream(self, targets, require=None, method='GET',
                   awesome_ua=False, add_padding=False,
                   custom_quoter=None, custom_headers=None):
        """
        Returns the raw payload string from the specified target(s).
        If multiple targets are specified, their resulting payloads are
        glued together.

        Each "target" is a bare URL string or a tuple containing an
        address and a dict for what to tack onto the query string.

        Finally, a require dict may be passed to enforce a Content-Type
        using key 'mime' and/or a minimum payload size using key 'size'.
        If using multiple targets, these requirements apply to each
        response.

        The underlying library here already understands how to search
        the environment for proxy settings (e.g. HTTP_PROXY), so we do
        not need to do anything extra for that.

        If add_padding is True, then some additional null padding will
        be added onto the stream returned. This is helpful for some web
        services that sometimes return MP3s that `mplayer` clips early.
        """
        DEFAULT_UA = 'Mozilla/5.0'
        DEFAULT_TIMEOUT = 3
        PADDING = '\0' * 2**11

        assert method in ['GET', 'POST'], "method must be GET or POST"
        from urllib2 import urlopen, Request, quote

        targets = targets if isinstance(targets, list) else [targets]
        # normalize each target to (url, encoded_query_string_or_None)
        targets = [
            (target, None) if isinstance(target, basestring)
            else (
                target[0],
                '&'.join(
                    '='.join([
                        key,
                        (
                            custom_quoter[key] if (custom_quoter and
                                                   key in custom_quoter)
                            else quote
                        )(
                            val.encode('utf-8') if isinstance(val, unicode)
                            else val if isinstance(val, str)
                            else str(val),
                            safe='',
                        ),
                    ])
                    for key, val in target[1].items()
                ),
            )
            for target in targets
        ]

        require = require or {}
        payloads = []

        for number, (url, params) in enumerate(targets, 1):
            desc = "web request" if len(targets) == 1 \
                else "web request (%d of %d)" % (number, len(targets))

            headers = {'User-Agent': DEFAULT_UA}
            if custom_headers:
                headers.update(custom_headers)

            response = urlopen(
                Request(
                    url=('?'.join([url, params]) if params and method == 'GET'
                         else url),
                    headers=headers,
                ),
                data=params if params and method == 'POST' else None,
                timeout=DEFAULT_TIMEOUT,
            )

            if not response:
                raise IOError("No response for %s" % desc)

            if response.getcode() != 200:
                value_error = ValueError(
                    "Got %d status for %s" %
                    (response.getcode(), desc)
                )
                try:
                    value_error.payload = response.read()
                    response.close()
                # FIX: StandardError no longer exists in Python 3;
                # Exception covers the same cases here
                except Exception:
                    pass
                raise value_error

            if 'mime' in require and \
                    require['mime'] != format(response.info().
                                              gettype()).replace('/x-', '/'):
                value_error = ValueError(
                    "Request got %s Content-Type for %s; wanted %s" %
                    (response.info().gettype(), desc, require['mime'])
                )
                value_error.got_mime = response.info().gettype()
                value_error.wanted_mime = require['mime']
                raise value_error

            payload = response.read()
            response.close()

            if 'size' in require and len(payload) < require['size']:
                raise self.TinyDownloadError(
                    "Request got %d-byte stream for %s; wanted %d+ bytes" %
                    (len(payload), desc, require['size'])
                )

            payloads.append(payload)

            # pad each payload; helps players that clip short MP3s
            if add_padding:
                payloads.append(PADDING)

        return ''.join(payloads)

    def net_download(self, path, *args, **kwargs):
        """
        Downloads a file to the given path from the specified target(s).
        See net_stream() for information about available options.
        """
        payload = self.net_stream(*args, **kwargs)
        with open(path, 'wb') as response_output:
            response_output.write(payload)
2018-07-01 10:55:30 +08:00
2018-07-09 14:12:54 +08:00
class _DictBuildWorker(QThread):
    """Local Dictionary Builder: runs *func* on a background Qt thread.

    The callable's return value becomes the builder; any failure simply
    leaves the builder as None.
    """

    def __init__(self, func):
        super(_DictBuildWorker, self).__init__()
        self._builder = None
        self._func = func

    def run(self):
        try:
            built = self._func()
        except Exception:
            built = None
        self._builder = built

    @property
    def builder(self):
        """The constructed builder, or None if construction failed."""
        return self._builder
2018-07-01 10:55:30 +08:00
class LocalService(Service):
    """
    Local Dictionary Service

    Base class for dictionaries backed by a local file (MDX, Stardict).
    Builder instances are expensive to create and are shared process-wide
    via the _mdx_builders map, guarded by a mutex.
    """

    def __init__(self, dict_path):
        super(LocalService, self).__init__()
        self.dict_path = dict_path
        # stable id derived from the dictionary path
        self._unique = md5(dict_path).hexdigest()
        self.builder = None
        self.missed_css = set()

    # MdxBuilder instances map, shared by all LocalService instances
    _mdx_builders = defaultdict(dict)
    _mutex_builder = QMutex()

    @staticmethod
    def _get_builer(key, func=None):
        """Return the cached builder for *key*, building it via *func* if
        needed.  NOTE: the name keeps the historical 'builer' typo because
        subclasses call it by this name."""
        LocalService._mutex_builder.lock()
        # FIX: unlock in a finally block — an exception while building
        # previously left the mutex locked forever
        try:
            key = md5(key).hexdigest()
            if func is not None:
                # FIX: 'in' instead of dict.has_key (removed in Python 3)
                if key not in LocalService._mdx_builders or not LocalService._mdx_builders[key]:
                    worker = _DictBuildWorker(func)
                    worker.start()
                    # keep the UI responsive while the index is built
                    while not worker.isFinished():
                        mw.app.processEvents()
                        worker.wait(100)
                    LocalService._mdx_builders[key] = worker.builder
        finally:
            LocalService._mutex_builder.unlock()
        return LocalService._mdx_builders[key]

    @property
    def support(self):
        return os.path.isfile(self.dict_path)

    @property
    def unique(self):
        return self._unique

    @property
    def title(self):
        # FIX: corrected 'Unkown' typo in the fallback display title
        return getattr(self, '__register_label__', u'Unknown')

    @property
    def _filename(self):
        """Dictionary file name without directory or extension."""
        return os.path.splitext(os.path.basename(self.dict_path))[0]

    def active(self, fld_ord, word):
        # css misses are tracked per query
        self.missed_css.clear()
        return super(LocalService, self).active(fld_ord, word)
2018-07-01 10:55:30 +08:00
class MdxService(LocalService):
    """
    MDX Local Dictionary Service

    Looks up words in an .mdx dictionary (with optional .mdd media
    archive) and rewrites the returned HTML so that css/js/images/sounds
    resolve inside Anki's collection media folder.
    """

    def __init__(self, dict_path):
        super(MdxService, self).__init__(dict_path)
        # media files already extracted during this session
        self.media_cache = defaultdict(set)
        # NOTE: shadows Service.cache with a word -> html-string map, so
        # Service.cached()/cache_result() do not apply to this class
        self.cache = defaultdict(str)
        # raw (un-adapted) html per word, used by get_html()
        self.html_cache = defaultdict(str)
        self.query_interval = 0.01
        self.styles = []
        if MdxService.check(self.dict_path):
            # builder construction is expensive; shared via LocalService
            self.builder = self._get_builer(dict_path, service_wrap(MdxBuilder, dict_path))

    @staticmethod
    def check(dict_path):
        """True when *dict_path* names an existing .mdx file."""
        return os.path.isfile(dict_path) and dict_path.lower().endswith('.mdx')

    @property
    def support(self):
        return self.builder and MdxService.check(self.dict_path)

    @property
    def title(self):
        # fall back to the file name when configured to, or when the mdx
        # header carries no usable title (builder._title is the raw header)
        if config.use_filename or not self.builder._title or self.builder._title.startswith('Title'):
            return self._filename
        else:
            return self.builder._title

    @export([u'默认', u'Default'])
    def fld_whole(self):
        """Default field: the whole adapted page plus its inline scripts."""
        html = self.get_default_html()
        js = re.findall(r'<script.*?>.*?</script>', html, re.DOTALL)
        return QueryResult(result=html, js=u'\n'.join(js))

    def _get_definition_mdx(self):
        """according to the word return mdx dictionary page"""
        content = self.builder.mdx_lookup(self.word)
        str_content = ""
        if len(content) > 0:
            for c in content:
                # strip newlines and the mdx-internal 'entry:/' link prefix
                str_content += c.replace("\r\n","").replace("entry:/","")
        return str_content

    def _get_definition_mdd(self, word):
        """according to the keyword(param word) return the media file contents"""
        # mdd archives key their entries with backslash separators
        word = word.replace('/', '\\')
        content = self.builder.mdd_lookup(word)
        if len(content) > 0:
            return [content[0]]
        else:
            return []

    def get_html(self):
        """get self.word's html page from MDX (raw, cached per word)"""
        if not self.html_cache[self.word]:
            html = self._get_definition_mdx()
            if html:
                self.html_cache[self.word] = html
        return self.html_cache[self.word]

    def save_file(self, filepath_in_mdx, savepath):
        """according to filepath_in_mdx to get media file and save it to savepath"""
        try:
            bytes_list = self._get_definition_mdd(filepath_in_mdx)
            if bytes_list:
                if not os.path.exists(savepath):
                    with open(savepath, 'wb') as f:
                        f.write(bytes_list[0])
                return savepath
        except sqlite3.OperationalError as e:
            # best-effort: a broken/locked mdd index yields no media
            #showInfo(str(e))
            pass
        return ''

    def get_default_html(self):
        '''
        default get html from mdx interface: look up the word, follow
        @@@LINK redirects, adapt media paths, and cache the result
        '''
        if not self.cache[self.word]:
            html = ''
            result = self.builder.mdx_lookup(self.word)  # self.word: unicode
            if result:
                if result[0].upper().find(u"@@@LINK=") > -1:
                    # redirect to a new word behind the equal symbol
                    # (recursion terminates when the target has no link)
                    self.word = result[0][len(u"@@@LINK="):].strip()
                    return self.get_default_html()
                else:
                    html = self.adapt_to_anki(result[0])
            self.cache[self.word] = html
        return self.cache[self.word]

    def adapt_to_anki(self, html):
        """
        1. convert the media path to actual path in anki's collection media folder.
        2. remove the js codes (js inside will expires.)
        """
        # convert media path, save media files
        media_files_set = set()
        mcss = re.findall(r'href="(\S+?\.css)"', html)
        media_files_set.update(set(mcss))
        mjs = re.findall(r'src="([\w\./]\S+?\.js)"', html)
        media_files_set.update(set(mjs))
        msrc = re.findall(r'<img.*?src="([\w\./]\S+?)".*?>', html)
        media_files_set.update(set(msrc))
        msound = re.findall(r'href="sound:(.*?\.(?:mp3|wav))"', html)
        if config.export_media:
            media_files_set.update(set(msound))
        # rewrite every reference to the '_'-prefixed basename Anki keeps
        for each in media_files_set:
            html = html.replace(each, u'_' + each.split('/')[-1])
        # find sounds and convert anchors to Anki [sound:...] markup
        p = re.compile(
            r'<a[^>]+?href=\"(sound:_.*?\.(?:mp3|wav))\"[^>]*?>(.*?)</a>')
        html = p.sub(u"[\\1]\\2", html)
        self.save_media_files(media_files_set)
        for f in mcss:
            cssfile = u'_{}'.format(os.path.basename(f.replace('\\', os.path.sep)))
            # if not exists the css file, the user can place the file to media
            # folder first, and it will also execute the wrap process to generate
            # the desired file.
            if not os.path.exists(cssfile):
                css_src = self.dict_path.replace(self._filename+u'.mdx', f)
                if os.path.exists(css_src):
                    shutil.copy(css_src, cssfile)
                else:
                    self.missed_css.add(cssfile[1:])
            # scope the css under a generated class and wrap the html in it
            new_css_file, wrap_class_name = wrap_css(cssfile)
            html = html.replace(cssfile, new_css_file)
            # add global div to the result html
            html = u'<div class="{0}">{1}</div>'.format(
                wrap_class_name, html)
        return html

    def save_default_file(self, filepath_in_mdx, savepath=None):
        '''
        default save file interface: extract one entry from the mdd
        archive into the media folder (skipping files already present)
        '''
        basename = os.path.basename(filepath_in_mdx.replace('\\', os.path.sep))
        if savepath is None:
            savepath = '_' + basename
        try:
            bytes_list = self.builder.mdd_lookup(filepath_in_mdx)
            if bytes_list and not os.path.exists(savepath):
                with open(savepath, 'wb') as f:
                    f.write(bytes_list[0])
            return savepath
        except sqlite3.OperationalError as e:
            # best-effort: a broken/locked mdd index is silently skipped
            pass

    def save_media_files(self, data):
        """
        get the necessary static files from local mdx dictionary
        ** kwargs: data = list
        Returns the patterns that matched no mdd key (errors).
        """
        diff = data.difference(self.media_cache['files'])
        self.media_cache['files'].update(diff)
        lst, errors = list(), list()
        wild = [
            '*' + os.path.basename(each.replace('\\', os.path.sep)) for each in diff]
        try:
            for each in wild:
                keys = self.builder.get_mdd_keys(each)
                if not keys:
                    errors.append(each)
                lst.extend(keys)
            for each in lst:
                self.save_default_file(each)
        except AttributeError:
            # builder has no mdd archive attached
            pass
        return errors
2018-07-01 10:55:30 +08:00
class StardictService(LocalService):
    '''
    Stardict Local Dictionary Service

    Looks up words in a Stardict dictionary identified by its .ifo file.
    '''

    def __init__(self, dict_path):
        super(StardictService, self).__init__(dict_path)
        self.query_interval = 0.05
        if not StardictService.check(self.dict_path):
            return
        self.builder = self._get_builer(
            dict_path,
            service_wrap(StardictBuilder, dict_path, in_memory=False)
        )
        if self.builder:
            self.builder.get_header()

    @staticmethod
    def check(dict_path):
        """True when *dict_path* names an existing .ifo file."""
        if not os.path.isfile(dict_path):
            return False
        return dict_path.lower().endswith('.ifo')

    @property
    def support(self):
        return self.builder and StardictService.check(self.dict_path)

    @property
    def title(self):
        # prefer the book name from the .ifo header unless configured
        # (or forced by a missing name) to use the file name
        if config.use_filename or not self.builder.ifo.bookname:
            return self._filename
        return self.builder.ifo.bookname.decode('utf-8')

    @export([u'默认', u'Default'])
    def fld_whole(self):
        """Default field: the whole definition, newlines become <br />."""
        #self.builder.check_build()
        try:
            definition = self.builder[self.word]
        except KeyError:
            return QueryResult.default()
        definition = definition.strip()
        # order matters: collapse CRLF first, then lone CR and LF
        for newline in ('\r\n', '\r', '\n'):
            definition = definition.replace(newline, '<br />')
        return QueryResult(result=definition)
class QueryResult(MapDict):
    """Query Result structure: a MapDict whose 'result' key holds the
    field text; optional style keys (css/js/jsfile/...) ride along."""

    def __init__(self, *args, **kwargs):
        super(QueryResult, self).__init__(*args, **kwargs)
        # avoid returning None as the field text
        # FIX: identity test instead of '== None' (PEP 8; also immune to
        # values with a custom __eq__)
        if self['result'] is None:
            self['result'] = ""

    def set_styles(self, **kwargs):
        """Attach style information (css, js, jsfile, ...) to the result."""
        for key, value in kwargs.items():
            self[key] = value

    @classmethod
    def default(cls):
        """An empty result."""
        # FIX: use cls so subclasses get an instance of their own type
        return cls(result="")