add baidu hanyu. fix #6
This commit is contained in:
parent
eee7fde2b8
commit
23fbfbd242
93
src/fastwq/service/baidu_chinese.py
Normal file
93
src/fastwq/service/baidu_chinese.py
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
#-*- coding:utf-8 -*-
|
||||||
|
from hashlib import sha1
|
||||||
|
from .base import WebService, export, register, with_styles, parseHtml
|
||||||
|
|
||||||
|
# Module-level switch read by Baidu_Chinese.fld_pron: the pronunciation MP3 is
# only downloaded when this is truthy; otherwise fld_pron returns ''.
baidu_download_mp3 = True
|
||||||
|
|
||||||
|
@register([u'百度汉语', u'Baidu Chinese'])
class Baidu_Chinese(WebService):
    """Dictionary service that scrapes word entries from hanyu.baidu.com.

    One page fetch fills a dict of fields (pinyin, basic definitions,
    English translation, pronunciation audio URL); the ``fld_*`` methods
    expose those fields through ``@export``.
    """

    def __init__(self):
        super(Baidu_Chinese, self).__init__()

    def _get_content(self, lang='eng'):
        """Fetch and parse the Baidu Hanyu page for ``self.word``.

        ``lang`` is accepted for interface compatibility but is not used.

        Returns whatever ``self.cache_this`` returns for the parsed dict,
        whose keys are 'pinyin', 'basicmean', 'syn_ant', 'fanyi' and
        'audio_url'. Keys whose markup is absent keep the default ''
        ('syn_ant' is never populated here).
        """
        url = u"https://hanyu.baidu.com/s?wd={word}".format(word=self.word)
        html = self.get_response(url, timeout=10)
        soup = parseHtml(html)
        result = {
            'pinyin': '',
            'basicmean': '',
            'syn_ant': '',
            'fanyi': '',
            'audio_url': '',
        }

        # Pinyin and pronunciation audio
        element = soup.find('div', id='pinyin')
        if element:
            syllables = element.find_all('b')
            if syllables:
                result['pinyin'] = u' '.join(
                    x.get_text() for x in syllables
                )
            # Check the anchor itself (not the <b> list) before reading its
            # attribute, and never store None when 'url' is missing.
            anchor = element.find('a')
            if anchor is not None:
                result['audio_url'] = anchor.get('url') or ''

        # Basic definitions
        element = soup.find('div', id='basicmean-wrapper')
        if element:
            tag = element.find_all('p')
            if tag:
                result['basicmean'] = u'<br>'.join(
                    x.get_text().strip() for x in tag
                )

        # English translation
        element = soup.find('div', id='fanyi-wrapper')
        if element:
            tag = element.find_all('dt')
            if tag:
                result['fanyi'] = u'<br>'.join(
                    x.get_text().strip() for x in tag
                )

        return self.cache_this(result)

    def _get_field(self, key, default=u''):
        """Return field ``key`` from the cache, fetching the page if needed.

        ``default`` is only used when a fresh fetch lacks ``key``.
        """
        if self.cached(key):
            return self.cache_result(key)
        return self._get_content().get(key, default)

    @export([u'拼音', u'Phoneticize'], 1)
    def fld_pinyin(self):
        """Return the pinyin syllables joined by single spaces."""
        return self._get_field('pinyin')

    @export('PRON', 2)
    def fld_pron(self):
        """Download the pronunciation MP3 and return its Anki label.

        The file is named ``<unique>-<sha1 of the word in five groups>.mp3``.
        Returns '' when downloading is disabled, when the page had no audio
        URL, or when the download fails for any reason (best effort).
        """
        audio_url = self._get_field('audio_url')
        if baidu_download_mp3 and audio_url:
            hex_digest = sha1(
                self.word.encode('utf-8') if isinstance(self.word, unicode)
                else self.word
            ).hexdigest().lower()
            assert len(hex_digest) == 40, "unexpected output from hash library"
            filename = '.'.join([
                '-'.join([
                    self.unique.lower(),
                    hex_digest[:8], hex_digest[8:16],
                    hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
                ]),
                'mp3',
            ])
            try:
                self.net_download(
                    filename,
                    audio_url,
                    require=dict(mime='audio/mp3', size=512),
                )
                return self.get_anki_label(filename, 'audio')
            except Exception:
                # Deliberate best effort: a failed download yields an empty
                # field. Unlike a bare ``except:``, this still lets
                # KeyboardInterrupt and SystemExit propagate.
                pass
        return ''

    @export([u'基本释义', u'Basic Definitions'], 3)
    def fld_basic(self):
        """Return the basic definitions, one per ``<br>``-separated line."""
        return self._get_field('basicmean')

    @export([u'英文翻译', u'Translation[En]'], 5)
    def fld_fanyi(self):
        """Return the English translation lines joined with ``<br>``."""
        return self._get_field('fanyi')
|
||||||
@ -256,6 +256,139 @@ class WebService(Service):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class TinyDownloadError(ValueError):
    """Raised when a downloaded payload is smaller than the required size."""
|
||||||
|
|
||||||
|
def net_stream(self, targets, require=None, method='GET',
               awesome_ua=False, add_padding=False,
               custom_quoter=None, custom_headers=None):
    """
    Returns the raw payload string from the specified target(s).
    If multiple targets are specified, their resulting payloads are
    glued together.

    Each "target" is a bare URL string or a tuple containing an
    address and a dict for what to tack onto the query string. With
    method 'GET' the encoded pairs are appended after a '?'; with
    'POST' they are sent as the request body.

    Finally, a require dict may be passed to enforce a Content-Type
    using key 'mime' and/or a minimum payload size using key 'size'.
    If using multiple targets, these requirements apply to each
    response.

    custom_quoter may map a query key to a callable used instead of
    the default quote function for that key's value; it is called as
    quoter(value, safe=''). custom_headers, if given, is merged over
    the default User-Agent header. awesome_ua is accepted but is not
    referenced by this implementation.

    The underlying library here already understands how to search
    the environment for proxy settings (e.g. HTTP_PROXY), so we do
    not need to do anything extra for that.

    If add_padding is True, then some additional null padding will
    be added onto the stream returned. This is helpful for some web
    services that sometimes return MP3s that `mplayer` clips early.

    Raises AssertionError for a method other than GET/POST, IOError
    when no response object is returned, ValueError for a non-200
    status (with a `payload` attribute when the body was readable)
    or a Content-Type mismatch (with `got_mime` and `wanted_mime`
    attributes), and TinyDownloadError for an undersized payload.
    """
    DEFAULT_UA = 'Mozilla/5.0'
    DEFAULT_TIMEOUT = 3

    PADDING = '\0' * 2**11

    assert method in ['GET', 'POST'], "method must be GET or POST"
    from urllib2 import urlopen, Request, quote

    def quote_value(key, val):
        """Percent-encode one query value, honoring custom_quoter."""
        if custom_quoter and key in custom_quoter:
            quoter = custom_quoter[key]
        else:
            quoter = quote
        if isinstance(val, unicode):
            val = val.encode('utf-8')
        elif not isinstance(val, str):
            val = str(val)
        return quoter(val, safe='')

    targets = targets if isinstance(targets, list) else [targets]
    targets = [
        (target, None) if isinstance(target, basestring)
        else (
            target[0],
            '&'.join(
                '='.join([key, quote_value(key, val)])
                for key, val in target[1].items()
            ),
        )
        for target in targets
    ]

    require = require or {}

    payloads = []

    for number, (url, params) in enumerate(targets, 1):
        desc = "web request" if len(targets) == 1 \
            else "web request (%d of %d)" % (number, len(targets))

        headers = {'User-Agent': DEFAULT_UA}
        if custom_headers:
            headers.update(custom_headers)

        response = urlopen(
            Request(
                url=('?'.join([url, params]) if params and method == 'GET'
                     else url),
                headers=headers,
            ),
            data=params if params and method == 'POST' else None,
            timeout=DEFAULT_TIMEOUT,
        )

        if not response:
            raise IOError("No response for %s" % desc)

        # Everything below runs under try/finally so the connection is
        # released on every exit path, including the validation errors.
        try:
            if response.getcode() != 200:
                value_error = ValueError(
                    "Got %d status for %s" %
                    (response.getcode(), desc)
                )
                try:
                    # Attach the error body for callers that want it;
                    # failing to read it must not mask the status error.
                    value_error.payload = response.read()
                except StandardError:
                    pass
                raise value_error

            if 'mime' in require:
                got_mime = response.info().gettype()
                # Treat experimental "x-" subtypes (audio/x-foo) as equal
                # to their plain counterparts (audio/foo).
                if require['mime'] != format(got_mime).replace('/x-', '/'):
                    value_error = ValueError(
                        "Request got %s Content-Type for %s; wanted %s" %
                        (got_mime, desc, require['mime'])
                    )
                    value_error.got_mime = got_mime
                    value_error.wanted_mime = require['mime']
                    raise value_error

            payload = response.read()
        finally:
            response.close()

        if 'size' in require and len(payload) < require['size']:
            raise self.TinyDownloadError(
                "Request got %d-byte stream for %s; wanted %d+ bytes" %
                (len(payload), desc, require['size'])
            )

        payloads.append(payload)

    if add_padding:
        payloads.append(PADDING)
    return ''.join(payloads)
|
||||||
|
|
||||||
|
def net_download(self, path, *args, **kwargs):
    """
    Fetch the given target(s) via net_stream() and save the combined
    payload to `path`, opened in binary write mode.

    All positional and keyword arguments after `path` are forwarded
    unchanged to net_stream(); see that method for the options. The
    file is only opened once the payload has been fetched.
    """

    data = self.net_stream(*args, **kwargs)
    with open(path, 'wb') as out_file:
        out_file.write(data)
|
||||||
|
|
||||||
|
|
||||||
class LocalService(Service):
|
class LocalService(Service):
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user