Add Baidu Hanyu service. Fixes #6.
This commit is contained in:
parent
eee7fde2b8
commit
23fbfbd242
93
src/fastwq/service/baidu_chinese.py
Normal file
93
src/fastwq/service/baidu_chinese.py
Normal file
@ -0,0 +1,93 @@
|
||||
#-*- coding:utf-8 -*-
|
||||
from hashlib import sha1
|
||||
from .base import WebService, export, register, with_styles, parseHtml
|
||||
|
||||
# Module-level switch read by Baidu_Chinese.fld_pron(): the pronunciation MP3
# is only downloaded when this is true and an audio URL was scraped; otherwise
# that field is left empty.
baidu_download_mp3 = True
|
||||
|
||||
@register([u'百度汉语', u'Baidu Chinese'])
class Baidu_Chinese(WebService):
    """
    Dictionary service that scrapes hanyu.baidu.com for the queried word.

    Exported fields: pinyin, a pronunciation audio label, the basic
    definitions and the English translation.
    """

    def __init__(self):
        super(Baidu_Chinese, self).__init__()

    def _get_content(self, lang='eng'):
        """
        Fetches the result page for self.word and extracts the fields.

        `lang` is accepted for signature compatibility but is not used
        here.

        Builds a dict with the keys 'pinyin', 'basicmean', 'syn_ant',
        'fanyi' and 'audio_url' (every value defaults to an empty
        string; 'syn_ant' is never filled in by this implementation),
        hands it to self.cache_this() and returns that call's result.
        NOTE(review): _get_field() calls .get() on the return value, so
        cache_this() presumably returns the dict itself -- confirm in
        the base class.
        """
        # NOTE(review): the word is inserted into the URL without
        # percent-encoding; presumably get_response() copes with that --
        # confirm against the base class.
        url = u"https://hanyu.baidu.com/s?wd={word}".format(word=self.word)
        html = self.get_response(url, timeout=10)
        soup = parseHtml(html)
        result = {
            'pinyin': '',
            'basicmean': '',
            'syn_ant': '',
            'fanyi': '',
            'audio_url': '',
        }

        # Pinyin and pronunciation audio
        element = soup.find('div', id='pinyin')
        if element:
            tags = element.find_all('b')
            if tags:
                result['pinyin'] = u' '.join(x.get_text() for x in tags)
            # Look the link up on its own: it may be missing even when
            # the pinyin is present, and calling .get() on a missing
            # element would raise AttributeError.
            link = element.find('a')
            if link:
                result['audio_url'] = link.get('url') or ''

        # Basic definitions
        element = soup.find('div', id='basicmean-wrapper')
        if element:
            tags = element.find_all('p')
            if tags:
                result['basicmean'] = u'<br>'.join(
                    x.get_text().strip() for x in tags
                )

        # English translation
        element = soup.find('div', id='fanyi-wrapper')
        if element:
            tags = element.find_all('dt')
            if tags:
                result['fanyi'] = u'<br>'.join(
                    x.get_text().strip() for x in tags
                )

        return self.cache_this(result)

    def _get_field(self, key, default=u''):
        """
        Returns the value stored under `key`, using the cached result
        when self.cached(key) is true and scraping the page otherwise.
        `default` is returned when a fresh result lacks the key.
        """
        if self.cached(key):
            return self.cache_result(key)
        return self._get_content().get(key, default)

    @export([u'拼音', u'Phoneticize'], 1)
    def fld_pinyin(self):
        """Returns the pinyin of the word, or an empty string."""
        return self._get_field('pinyin')

    @export('PRON', 2)
    def fld_pron(self):
        """
        Downloads the pronunciation MP3 and returns its audio label.

        The file name is built from self.unique (lower-cased) and the
        SHA-1 digest of the word. Returns an empty string when
        downloading is switched off, when no audio URL was scraped, or
        when the download fails (best effort).
        """
        audio_url = self._get_field('audio_url')
        if baidu_download_mp3 and audio_url:
            word = self.word
            if isinstance(word, unicode):
                word = word.encode('utf-8')
            hex_digest = sha1(word).hexdigest().lower()
            filename = '.'.join([
                '-'.join([
                    self.unique.lower(),
                    hex_digest[:8], hex_digest[8:16],
                    hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
                ]),
                'mp3',
            ])
            try:
                self.net_download(
                    filename,
                    audio_url,
                    require=dict(mime='audio/mp3', size=512),
                )
                return self.get_anki_label(filename, 'audio')
            except Exception:
                # Deliberately best effort: a failed download leaves the
                # field empty. Exception (not a bare except) so that
                # KeyboardInterrupt and SystemExit still propagate.
                pass
        return ''

    @export([u'基本释义', u'Basic Definitions'], 3)
    def fld_basic(self):
        """Returns the basic definitions joined with <br>, or ''."""
        return self._get_field('basicmean')

    @export([u'英文翻译', u'Translation[En]'], 5)
    def fld_fanyi(self):
        """Returns the English translation joined with <br>, or ''."""
        return self._get_field('fanyi')
|
||||
@ -256,6 +256,139 @@ class WebService(Service):
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
class TinyDownloadError(ValueError):
    """Signals that a downloaded payload fell short of the required size."""
|
||||
|
||||
def net_stream(self, targets, require=None, method='GET',
               awesome_ua=False, add_padding=False,
               custom_quoter=None, custom_headers=None):
    """
    Returns the raw payload string from the specified target(s).
    If multiple targets are specified, their resulting payloads are
    glued together.

    Each "target" is a bare URL string or a tuple containing an
    address and a dict for what to tack onto the query string.

    Finally, a require dict may be passed to enforce a Content-Type
    using key 'mime' and/or a minimum payload size using key 'size'.
    If using multiple targets, these requirements apply to each
    response.

    The underlying library here already understands how to search
    the environment for proxy settings (e.g. HTTP_PROXY), so we do
    not need to do anything extra for that.

    If add_padding is True, then some additional null padding will
    be added onto the stream returned. This is helpful for some web
    services that sometimes return MP3s that `mplayer` clips early.

    custom_quoter may map a query key to a callable used in place of
    the default quote() for that key's value; it is called as
    quoter(value, safe=''). custom_headers, when given, is merged
    over the default User-Agent header. awesome_ua is accepted but
    not used by this implementation.

    Raises AssertionError for a method other than GET or POST,
    IOError when no response object is returned, ValueError for a
    non-200 status (the body, when readable, is attached as
    `.payload`) or a Content-Type mismatch (`.got_mime` and
    `.wanted_mime` are attached), and TinyDownloadError when a
    payload is smaller than require['size'].
    """
    DEFAULT_UA = 'Mozilla/5.0'
    DEFAULT_TIMEOUT = 3

    PADDING = '\0' * 2**11

    assert method in ['GET', 'POST'], "method must be GET or POST"
    from urllib2 import urlopen, Request, quote

    def encode_query(params):
        """Percent-encodes a dict of parameters into a query string."""
        pairs = []
        for key, val in params.items():
            if custom_quoter and key in custom_quoter:
                quoter = custom_quoter[key]
            else:
                quoter = quote
            # quote() wants a byte string: encode unicode, stringify
            # anything that is not already a str.
            if isinstance(val, unicode):
                val = val.encode('utf-8')
            elif not isinstance(val, str):
                val = str(val)
            pairs.append('='.join([key, quoter(val, safe='')]))
        return '&'.join(pairs)

    if not isinstance(targets, list):
        targets = [targets]
    targets = [
        (target, None) if isinstance(target, basestring)
        else (target[0], encode_query(target[1]))
        for target in targets
    ]

    require = require or {}

    payloads = []

    for number, (url, params) in enumerate(targets, 1):
        desc = "web request" if len(targets) == 1 \
            else "web request (%d of %d)" % (number, len(targets))

        headers = {'User-Agent': DEFAULT_UA}
        if custom_headers:
            headers.update(custom_headers)

        response = urlopen(
            Request(
                url=('?'.join([url, params]) if params and method == 'GET'
                     else url),
                headers=headers,
            ),
            data=params if params and method == 'POST' else None,
            timeout=DEFAULT_TIMEOUT,
        )

        if not response:
            raise IOError("No response for %s" % desc)

        # Everything that touches the open response lives inside this
        # try so the connection is released on every exit path,
        # including the validation failures below.
        try:
            if response.getcode() != 200:
                value_error = ValueError(
                    "Got %d status for %s" %
                    (response.getcode(), desc)
                )
                try:
                    value_error.payload = response.read()
                except StandardError:
                    pass
                raise value_error

            if 'mime' in require:
                got_mime = response.info().gettype()
                # Experimental "x-" subtypes count as their plain
                # form, e.g. audio/x-mp3 satisfies audio/mp3.
                if require['mime'] != format(got_mime).replace('/x-', '/'):
                    value_error = ValueError(
                        "Request got %s Content-Type for %s; wanted %s" %
                        (got_mime, desc, require['mime'])
                    )
                    value_error.got_mime = got_mime
                    value_error.wanted_mime = require['mime']
                    raise value_error

            payload = response.read()
        finally:
            response.close()

        if 'size' in require and len(payload) < require['size']:
            raise self.TinyDownloadError(
                "Request got %d-byte stream for %s; wanted %d+ bytes" %
                (len(payload), desc, require['size'])
            )

        payloads.append(payload)

    if add_padding:
        payloads.append(PADDING)
    return ''.join(payloads)
|
||||
|
||||
def net_download(self, path, *args, **kwargs):
    """
    Fetches the target(s) through net_stream() and saves the bytes
    to `path`.

    Every argument after `path` is forwarded untouched to
    net_stream(); refer to that method for the options it accepts.
    """

    # Fetch first, open afterwards: a failed request then never
    # creates or truncates the destination file.
    data = self.net_stream(*args, **kwargs)
    with open(path, 'wb') as out_file:
        out_file.write(data)
|
||||
|
||||
|
||||
class LocalService(Service):
|
||||
"""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user