From 23fbfbd242a08d68524dd9f3f2a2332e5c195815 Mon Sep 17 00:00:00 2001
From: "St.Huang"
Date: Sat, 7 Jul 2018 22:51:01 +0800
Subject: [PATCH] add baidu hanyu. fix #6

---
Review notes (this section is not part of the commit message):
- The line structure of this patch was rebuilt from a copy whose whitespace
  had been collapsed. The indentation of the unchanged context lines in
  base.py and the u'<br>' separators in baidu_chinese.py are
  reconstructions; confirm them against the repository before applying.
- The added code was revised during review, so the blob ids on the
  "index" lines below no longer describe the resulting files:
  * _get_content: the audio link is looked up and checked on its own
    instead of re-testing the <b> list and dereferencing an unchecked
    find('a') result.
  * fld_pron: removed an assignment to `filename` that was overwritten
    immediately; the bare `except:` is now `except Exception:`.
  * net_stream: the response is closed on every path, including the
    Content-Type mismatch error.
  * Comments are in English and docstrings were added.

 src/fastwq/service/baidu_chinese.py | 110 +++++++++++++++++++++
 src/fastwq/service/base.py          | 142 ++++++++++++++++++++++++++++
 2 files changed, 252 insertions(+)
 create mode 100644 src/fastwq/service/baidu_chinese.py

diff --git a/src/fastwq/service/baidu_chinese.py b/src/fastwq/service/baidu_chinese.py
new file mode 100644
index 0000000..5d918c8
--- /dev/null
+++ b/src/fastwq/service/baidu_chinese.py
@@ -0,0 +1,110 @@
+#-*- coding:utf-8 -*-
+"""Baidu Hanyu (hanyu.baidu.com) dictionary service."""
+from hashlib import sha1
+from .base import WebService, export, register, with_styles, parseHtml
+
+# Set to False to stop fld_pron from downloading the pronunciation MP3.
+baidu_download_mp3 = True
+
+@register([u'百度汉语', u'Baidu Chinese'])
+class Baidu_Chinese(WebService):
+    """Looks up self.word on hanyu.baidu.com and exports its fields."""
+
+    def __init__(self):
+        super(Baidu_Chinese, self).__init__()
+
+    def _get_content(self, lang='eng'):
+        """
+        Fetches the page for self.word, extracts the fields and hands
+        the resulting dict to self.cache_this(), whose return value is
+        returned. `lang` is accepted but not used.
+        """
+        url = u"https://hanyu.baidu.com/s?wd={word}".format(word=self.word)
+        html = self.get_response(url, timeout=10)
+        soup = parseHtml(html)
+        result = {
+            'pinyin': '',
+            'basicmean': '',
+            'syn_ant': '',
+            'fanyi': '',
+            'audio_url': '',
+        }
+
+        # Pinyin and pronunciation audio
+        element = soup.find('div', id='pinyin')
+        if element:
+            tag = element.find_all('b')
+            if tag:
+                result['pinyin'] = u' '.join(x.get_text() for x in tag)
+            # Look the audio link up on its own: it may be missing even
+            # when the pinyin is present, and vice versa.
+            tag = element.find('a')
+            if tag:
+                result['audio_url'] = tag.get('url') or ''
+
+        # Basic definitions
+        element = soup.find('div', id='basicmean-wrapper')
+        if element:
+            tag = element.find_all('p')
+            if tag:
+                result['basicmean'] = u'<br>'.join(x.get_text().strip() for x in tag)
+
+        # English translation
+        element = soup.find('div', id='fanyi-wrapper')
+        if element:
+            tag = element.find_all('dt')
+            if tag:
+                result['fanyi'] = u'<br>'.join(x.get_text().strip() for x in tag)
+
+        return self.cache_this(result)
+
+    def _get_field(self, key, default=u''):
+        """Returns field `key` from the cache, fetching the page if needed."""
+        return self.cache_result(key) if self.cached(key) else self._get_content().get(key, default)
+
+    @export([u'拼音', u'Phoneticize'], 1)
+    def fld_pinyin(self):
+        return self._get_field('pinyin')
+
+    @export('PRON', 2)
+    def fld_pron(self):
+        """
+        Downloads the pronunciation MP3 and returns the result of
+        self.get_anki_label() for it, or '' when downloading is
+        disabled, no audio URL was found or the download failed.
+        """
+        audio_url = self._get_field('audio_url')
+        if baidu_download_mp3 and audio_url:
+            # File name: <unique>-<sha1 of the word in five groups>.mp3
+            hex_digest = sha1(
+                self.word.encode('utf-8') if isinstance(self.word, unicode)
+                else self.word
+            ).hexdigest().lower()
+            assert len(hex_digest) == 40, "unexpected output from hash library"
+            filename = '.'.join([
+                '-'.join([
+                    self.unique.lower(
+                    ), hex_digest[:8], hex_digest[8:16],
+                    hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
+                ]),
+                'mp3',
+            ])
+            try:
+                self.net_download(
+                    filename,
+                    audio_url,
+                    require=dict(mime='audio/mp3', size=512),
+                )
+                return self.get_anki_label(filename, 'audio')
+            except Exception:
+                # Best effort: a failed download yields an empty field.
+                pass
+        return ''
+
+    @export([u'基本释义', u'Basic Definitions'], 3)
+    def fld_basic(self):
+        return self._get_field('basicmean')
+
+    @export([u'英文翻译', u'Translation[En]'], 5)
+    def fld_fanyi(self):
+        return self._get_field('fanyi')
diff --git a/src/fastwq/service/base.py b/src/fastwq/service/base.py
index 7d5bd9d..50df902 100644
--- a/src/fastwq/service/base.py
+++ b/src/fastwq/service/base.py
@@ -256,6 +256,148 @@ class WebService(Service):
         except Exception as e:
             pass
 
+    class TinyDownloadError(ValueError):
+        """Raises when a download is too small."""
+
+    def net_stream(self, targets, require=None, method='GET',
+                   awesome_ua=False, add_padding=False,
+                   custom_quoter=None, custom_headers=None):
+        """
+        Returns the raw payload string from the specified target(s).
+        If multiple targets are specified, their resulting payloads are
+        glued together.
+
+        Each "target" is a bare URL string or a tuple containing an
+        address and a dict for what to tack onto the query string.
+
+        Finally, a require dict may be passed to enforce a Content-Type
+        using key 'mime' and/or a minimum payload size using key 'size'.
+        If using multiple targets, these requirements apply to each
+        response.
+
+        The underlying library here already understands how to search
+        the environment for proxy settings (e.g. HTTP_PROXY), so we do
+        not need to do anything extra for that.
+
+        If add_padding is True, then some additional null padding will
+        be added onto the stream returned. This is helpful for some web
+        services that sometimes return MP3s that `mplayer` clips early.
+
+        Raises IOError when there is no response, ValueError on a non-200
+        status or a Content-Type mismatch, and TinyDownloadError when a
+        payload is smaller than require['size'].
+        """
+        DEFAULT_UA = 'Mozilla/5.0'
+        DEFAULT_TIMEOUT = 3
+
+        PADDING = '\0' * 2**11
+
+        assert method in ['GET', 'POST'], "method must be GET or POST"
+        from urllib2 import urlopen, Request, quote
+
+        targets = targets if isinstance(targets, list) else [targets]
+        targets = [
+            (target, None) if isinstance(target, basestring)
+            else (
+                target[0],
+                '&'.join(
+                    '='.join([
+                        key,
+                        (
+                            custom_quoter[key] if (custom_quoter and
+                                                   key in custom_quoter)
+                            else quote
+                        )(
+                            val.encode('utf-8') if isinstance(val, unicode)
+                            else val if isinstance(val, str)
+                            else str(val),
+                            safe='',
+                        ),
+                    ])
+                    for key, val in target[1].items()
+                ),
+            )
+            for target in targets
+        ]
+
+        require = require or {}
+
+        payloads = []
+
+        for number, (url, params) in enumerate(targets, 1):
+            desc = "web request" if len(targets) == 1 \
+                else "web request (%d of %d)" % (number, len(targets))
+
+            headers = {'User-Agent': DEFAULT_UA}
+            if custom_headers:
+                headers.update(custom_headers)
+
+            response = urlopen(
+                Request(
+                    url=('?'.join([url, params]) if params and method == 'GET'
+                         else url),
+                    headers=headers,
+                ),
+                data=params if params and method == 'POST' else None,
+                timeout=DEFAULT_TIMEOUT,
+            )
+
+            if not response:
+                raise IOError("No response for %s" % desc)
+
+            # Close the response on every path, including the errors
+            # raised below.
+            try:
+                if response.getcode() != 200:
+                    value_error = ValueError(
+                        "Got %d status for %s" %
+                        (response.getcode(), desc)
+                    )
+                    try:
+                        value_error.payload = response.read()
+                    except StandardError:
+                        pass
+                    raise value_error
+
+                if 'mime' in require:
+                    got_mime = response.info().gettype()
+                    # Drop '/x-' so that e.g. audio/x-mp3 satisfies audio/mp3.
+                    if require['mime'] != \
+                            format(got_mime).replace('/x-', '/'):
+                        value_error = ValueError(
+                            "Request got %s Content-Type for %s; wanted %s" %
+                            (got_mime, desc, require['mime'])
+                        )
+                        value_error.got_mime = got_mime
+                        value_error.wanted_mime = require['mime']
+                        raise value_error
+
+                payload = response.read()
+            finally:
+                response.close()
+
+            if 'size' in require and len(payload) < require['size']:
+                raise self.TinyDownloadError(
+                    "Request got %d-byte stream for %s; wanted %d+ bytes" %
+                    (len(payload), desc, require['size'])
+                )
+
+            payloads.append(payload)
+
+        if add_padding:
+            payloads.append(PADDING)
+        return ''.join(payloads)
+
+    def net_download(self, path, *args, **kwargs):
+        """
+        Downloads a file to the given path from the specified target(s).
+        See net_stream() for information about available options.
+        """
+
+        payload = self.net_stream(*args, **kwargs)
+        with open(path, 'wb') as response_output:
+            response_output.write(payload)
+
 
 class LocalService(Service):
     """