add baidu hanyu. fix #6

St.Huang 2018-07-07 22:51:01 +08:00
parent eee7fde2b8
commit 23fbfbd242
2 changed files with 226 additions and 0 deletions


@@ -0,0 +1,93 @@
#-*- coding:utf-8 -*-
from hashlib import sha1

from .base import WebService, export, register, with_styles, parseHtml

baidu_download_mp3 = True


@register([u'百度汉语', u'Baidu Chinese'])
class Baidu_Chinese(WebService):

    def __init__(self):
        super(Baidu_Chinese, self).__init__()

    def _get_content(self, lang='eng'):
        url = u"https://hanyu.baidu.com/s?wd={word}".format(word=self.word)
        html = self.get_response(url, timeout=10)
        soup = parseHtml(html)
        result = {
            'pinyin': '',
            'basicmean': '',
            'syn_ant': '',
            'fanyi': '',
            'audio_url': '',
        }

        # Pinyin (拼音) and pronunciation audio URL
        element = soup.find('div', id='pinyin')
        if element:
            tag = element.find_all('b')
            if tag:
                result['pinyin'] = u' '.join(x.get_text() for x in tag)
            tag = element.find('a')
            if tag:
                result['audio_url'] = tag.get('url')

        # Basic definitions (基本释义)
        element = soup.find('div', id='basicmean-wrapper')
        if element:
            tag = element.find_all('p')
            if tag:
                result['basicmean'] = u'<br>'.join(x.get_text().strip() for x in tag)

        # English translation (英文翻译)
        element = soup.find('div', id='fanyi-wrapper')
        if element:
            tag = element.find_all('dt')
            if tag:
                result['fanyi'] = u'<br>'.join(x.get_text().strip() for x in tag)

        return self.cache_this(result)

    def _get_field(self, key, default=u''):
        return self.cache_result(key) if self.cached(key) \
            else self._get_content().get(key, default)

    @export([u'拼音', u'Phoneticize'], 1)
    def fld_pinyin(self):
        return self._get_field('pinyin')

    @export('PRON', 2)
    def fld_pron(self):
        audio_url = self._get_field('audio_url')
        if baidu_download_mp3 and audio_url:
            hex_digest = sha1(
                self.word.encode('utf-8') if isinstance(self.word, unicode)
                else self.word
            ).hexdigest().lower()
            assert len(hex_digest) == 40, "unexpected output from hash library"
            filename = '.'.join([
                '-'.join([
                    self.unique.lower(), hex_digest[:8], hex_digest[8:16],
                    hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
                ]),
                'mp3',
            ])
            try:
                self.net_download(
                    filename,
                    audio_url,
                    require=dict(mime='audio/mp3', size=512),
                )
                return self.get_anki_label(filename, 'audio')
            except Exception:
                pass
        return ''

    @export([u'基本释义', u'Basic Definitions'], 3)
    def fld_basic(self):
        return self._get_field('basicmean')

    @export([u'英文翻译', u'Translation[En]'], 5)
    def fld_fanyi(self):
        return self._get_field('fanyi')

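For reference, here is a minimal standalone sketch of the same scraping logic outside the add-on, assuming the requests and BeautifulSoup 4 libraries in place of the plugin's get_response/parseHtml helpers; the function name is illustrative only.

# Standalone sketch (assumption: requests + bs4 installed); mirrors the
# selectors used in Baidu_Chinese._get_content above.
import requests
from bs4 import BeautifulSoup

def fetch_baidu_hanyu(word):
    # Same endpoint as the service: https://hanyu.baidu.com/s?wd=<word>
    html = requests.get('https://hanyu.baidu.com/s', params={'wd': word},
                        headers={'User-Agent': 'Mozilla/5.0'}, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    result = {'pinyin': '', 'basicmean': '', 'fanyi': '', 'audio_url': ''}

    pinyin_div = soup.find('div', id='pinyin')
    if pinyin_div:
        bolds = pinyin_div.find_all('b')
        if bolds:
            result['pinyin'] = u' '.join(b.get_text() for b in bolds)
        link = pinyin_div.find('a')
        if link:
            result['audio_url'] = link.get('url')

    basic_div = soup.find('div', id='basicmean-wrapper')
    if basic_div:
        result['basicmean'] = u'<br>'.join(
            p.get_text().strip() for p in basic_div.find_all('p'))

    fanyi_div = soup.find('div', id='fanyi-wrapper')
    if fanyi_div:
        result['fanyi'] = u'<br>'.join(
            dt.get_text().strip() for dt in fanyi_div.find_all('dt'))

    return result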

@@ -256,6 +256,139 @@ class WebService(Service):
        except Exception as e:
            pass
    class TinyDownloadError(ValueError):
        """Raised when a download is too small."""

    def net_stream(self, targets, require=None, method='GET',
                   awesome_ua=False, add_padding=False,
                   custom_quoter=None, custom_headers=None):
        """
        Returns the raw payload string from the specified target(s).
        If multiple targets are specified, their resulting payloads are
        glued together.

        Each "target" is a bare URL string or a tuple containing an
        address and a dict for what to tack onto the query string.

        Finally, a require dict may be passed to enforce a Content-Type
        using key 'mime' and/or a minimum payload size using key 'size'.
        If using multiple targets, these requirements apply to each
        response.

        The underlying library here already understands how to search
        the environment for proxy settings (e.g. HTTP_PROXY), so we do
        not need to do anything extra for that.

        If add_padding is True, then some additional null padding will
        be added onto the stream returned. This is helpful for some web
        services that sometimes return MP3s that `mplayer` clips early.
        """

        DEFAULT_UA = 'Mozilla/5.0'
        DEFAULT_TIMEOUT = 3
        PADDING = '\0' * 2**11

        assert method in ['GET', 'POST'], "method must be GET or POST"
        from urllib2 import urlopen, Request, quote

        # Normalize every target to a (url, encoded-params-or-None) pair.
        targets = targets if isinstance(targets, list) else [targets]
        targets = [
            (target, None) if isinstance(target, basestring)
            else (
                target[0],
                '&'.join(
                    '='.join([
                        key,
                        (
                            custom_quoter[key] if (custom_quoter and
                                                   key in custom_quoter)
                            else quote
                        )(
                            val.encode('utf-8') if isinstance(val, unicode)
                            else val if isinstance(val, str)
                            else str(val),
                            safe='',
                        ),
                    ])
                    for key, val in target[1].items()
                ),
            )
            for target in targets
        ]

        require = require or {}

        payloads = []

        for number, (url, params) in enumerate(targets, 1):
            desc = "web request" if len(targets) == 1 \
                else "web request (%d of %d)" % (number, len(targets))

            headers = {'User-Agent': DEFAULT_UA}
            if custom_headers:
                headers.update(custom_headers)

            response = urlopen(
                Request(
                    url=('?'.join([url, params]) if params and method == 'GET'
                         else url),
                    headers=headers,
                ),
                data=params if params and method == 'POST' else None,
                timeout=DEFAULT_TIMEOUT,
            )

            if not response:
                raise IOError("No response for %s" % desc)

            if response.getcode() != 200:
                value_error = ValueError(
                    "Got %d status for %s" %
                    (response.getcode(), desc)
                )
                try:
                    value_error.payload = response.read()
                    response.close()
                except StandardError:
                    pass
                raise value_error

            if 'mime' in require and \
                    require['mime'] != format(response.info().
                                              gettype()).replace('/x-', '/'):
                value_error = ValueError(
                    "Request got %s Content-Type for %s; wanted %s" %
                    (response.info().gettype(), desc, require['mime'])
                )
                value_error.got_mime = response.info().gettype()
                value_error.wanted_mime = require['mime']
                raise value_error

            payload = response.read()
            response.close()

            if 'size' in require and len(payload) < require['size']:
                raise self.TinyDownloadError(
                    "Request got %d-byte stream for %s; wanted %d+ bytes" %
                    (len(payload), desc, require['size'])
                )

            payloads.append(payload)

            if add_padding:
                payloads.append(PADDING)

        return ''.join(payloads)
    def net_download(self, path, *args, **kwargs):
        """
        Downloads a file to the given path from the specified target(s).
        See net_stream() for information about available options.
        """

        payload = self.net_stream(*args, **kwargs)
        with open(path, 'wb') as response_output:
            response_output.write(payload)


class LocalService(Service):
    """