add baidu hanyu. fix #6

St.Huang 2018-07-07 22:51:01 +08:00
parent eee7fde2b8
commit 23fbfbd242
2 changed files with 226 additions and 0 deletions


@@ -0,0 +1,93 @@
#-*- coding:utf-8 -*-
from hashlib import sha1

from .base import WebService, export, register, with_styles, parseHtml

baidu_download_mp3 = True


@register([u'百度汉语', u'Baidu Chinese'])
class Baidu_Chinese(WebService):

    def __init__(self):
        super(Baidu_Chinese, self).__init__()

    def _get_content(self, lang='eng'):
        url = u"https://hanyu.baidu.com/s?wd={word}".format(word=self.word)
        html = self.get_response(url, timeout=10)
        soup = parseHtml(html)
        result = {
            'pinyin': '',
            'basicmean': '',
            'syn_ant': '',
            'fanyi': '',
            'audio_url': '',
        }

        # Pinyin (拼音) and pronunciation audio URL
        element = soup.find('div', id='pinyin')
        if element:
            tag = element.find_all('b')
            if tag:
                result['pinyin'] = u' '.join(x.get_text() for x in tag)
            tag = element.find('a')
            if tag:
                result['audio_url'] = tag.get('url')

        # Basic definitions (基本释义)
        element = soup.find('div', id='basicmean-wrapper')
        if element:
            tag = element.find_all('p')
            if tag:
                result['basicmean'] = u'<br>'.join(x.get_text().strip() for x in tag)

        # English translation (英文翻译)
        element = soup.find('div', id='fanyi-wrapper')
        if element:
            tag = element.find_all('dt')
            if tag:
                result['fanyi'] = u'<br>'.join(x.get_text().strip() for x in tag)

        return self.cache_this(result)

    def _get_field(self, key, default=u''):
        return self.cache_result(key) if self.cached(key) \
            else self._get_content().get(key, default)

    @export([u'拼音', u'Phoneticize'], 1)
    def fld_pinyin(self):
        return self._get_field('pinyin')

    @export('PRON', 2)
    def fld_pron(self):
        audio_url = self._get_field('audio_url')
        if baidu_download_mp3 and audio_url:
            hex_digest = sha1(
                self.word.encode('utf-8') if isinstance(self.word, unicode)
                else self.word
            ).hexdigest().lower()
            assert len(hex_digest) == 40, "unexpected output from hash library"
            filename = '.'.join([
                '-'.join([
                    self.unique.lower(), hex_digest[:8], hex_digest[8:16],
                    hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
                ]),
                'mp3',
            ])
            try:
                self.net_download(
                    filename,
                    audio_url,
                    require=dict(mime='audio/mp3', size=512),
                )
                return self.get_anki_label(filename, 'audio')
            except Exception:
                pass
        return ''

    @export([u'基本释义', u'Basic Definitions'], 3)
    def fld_basic(self):
        return self._get_field('basicmean')

    @export([u'英文翻译', u'Translation[En]'], 5)
    def fld_fanyi(self):
        return self._get_field('fanyi')

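For reference, here is a minimal standalone sketch of the same scraping logic outside the add-on, assuming the requests and BeautifulSoup 4 libraries in place of the plugin's get_response/parseHtml helpers; the function name is illustrative only.

# Standalone sketch (assumption: requests + bs4 installed); mirrors the
# selectors used in Baidu_Chinese._get_content above.
import requests
from bs4 import BeautifulSoup

def fetch_baidu_hanyu(word):
    # Same endpoint as the service: https://hanyu.baidu.com/s?wd=<word>
    html = requests.get('https://hanyu.baidu.com/s', params={'wd': word},
                        headers={'User-Agent': 'Mozilla/5.0'}, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    result = {'pinyin': '', 'basicmean': '', 'fanyi': '', 'audio_url': ''}

    pinyin_div = soup.find('div', id='pinyin')
    if pinyin_div:
        bolds = pinyin_div.find_all('b')
        if bolds:
            result['pinyin'] = u' '.join(b.get_text() for b in bolds)
        link = pinyin_div.find('a')
        if link:
            result['audio_url'] = link.get('url')

    basic_div = soup.find('div', id='basicmean-wrapper')
    if basic_div:
        result['basicmean'] = u'<br>'.join(
            p.get_text().strip() for p in basic_div.find_all('p'))

    fanyi_div = soup.find('div', id='fanyi-wrapper')
    if fanyi_div:
        result['fanyi'] = u'<br>'.join(
            dt.get_text().strip() for dt in fanyi_div.find_all('dt'))

    return result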

@@ -256,6 +256,139 @@ class WebService(Service):
        except Exception as e:
            pass
    class TinyDownloadError(ValueError):
        """Raised when a download is too small."""

    def net_stream(self, targets, require=None, method='GET',
                   awesome_ua=False, add_padding=False,
                   custom_quoter=None, custom_headers=None):
        """
        Returns the raw payload string from the specified target(s).
        If multiple targets are specified, their resulting payloads are
        glued together.

        Each "target" is a bare URL string or a tuple containing an
        address and a dict for what to tack onto the query string.

        Finally, a require dict may be passed to enforce a Content-Type
        using key 'mime' and/or a minimum payload size using key 'size'.
        If using multiple targets, these requirements apply to each
        response.

        The underlying library here already understands how to search
        the environment for proxy settings (e.g. HTTP_PROXY), so we do
        not need to do anything extra for that.

        If add_padding is True, then some additional null padding will
        be added onto the stream returned. This is helpful for some web
        services that sometimes return MP3s that `mplayer` clips early.
        """

        DEFAULT_UA = 'Mozilla/5.0'
        DEFAULT_TIMEOUT = 3
        PADDING = '\0' * 2**11

        assert method in ['GET', 'POST'], "method must be GET or POST"
        from urllib2 import urlopen, Request, quote

        # Normalize every target to a (url, encoded-params-or-None) pair.
        targets = targets if isinstance(targets, list) else [targets]
        targets = [
            (target, None) if isinstance(target, basestring)
            else (
                target[0],
                '&'.join(
                    '='.join([
                        key,
                        (
                            custom_quoter[key] if (custom_quoter and
                                                   key in custom_quoter)
                            else quote
                        )(
                            val.encode('utf-8') if isinstance(val, unicode)
                            else val if isinstance(val, str)
                            else str(val),
                            safe='',
                        ),
                    ])
                    for key, val in target[1].items()
                ),
            )
            for target in targets
        ]

        require = require or {}

        payloads = []

        for number, (url, params) in enumerate(targets, 1):
            desc = "web request" if len(targets) == 1 \
                else "web request (%d of %d)" % (number, len(targets))

            headers = {'User-Agent': DEFAULT_UA}
            if custom_headers:
                headers.update(custom_headers)

            response = urlopen(
                Request(
                    url=('?'.join([url, params]) if params and method == 'GET'
                         else url),
                    headers=headers,
                ),
                data=params if params and method == 'POST' else None,
                timeout=DEFAULT_TIMEOUT,
            )

            if not response:
                raise IOError("No response for %s" % desc)

            if response.getcode() != 200:
                value_error = ValueError(
                    "Got %d status for %s" %
                    (response.getcode(), desc)
                )
                try:
                    value_error.payload = response.read()
                    response.close()
                except StandardError:
                    pass
                raise value_error

            if 'mime' in require and \
                    require['mime'] != format(response.info().
                                              gettype()).replace('/x-', '/'):
                value_error = ValueError(
                    "Request got %s Content-Type for %s; wanted %s" %
                    (response.info().gettype(), desc, require['mime'])
                )
                value_error.got_mime = response.info().gettype()
                value_error.wanted_mime = require['mime']
                raise value_error

            payload = response.read()
            response.close()

            if 'size' in require and len(payload) < require['size']:
                raise self.TinyDownloadError(
                    "Request got %d-byte stream for %s; wanted %d+ bytes" %
                    (len(payload), desc, require['size'])
                )

            payloads.append(payload)

            if add_padding:
                payloads.append(PADDING)

        return ''.join(payloads)
    def net_download(self, path, *args, **kwargs):
        """
        Downloads a file to the given path from the specified target(s).
        See net_stream() for information about available options.
        """

        payload = self.net_stream(*args, **kwargs)
        with open(path, 'wb') as response_output:
            response_output.write(payload)


class LocalService(Service):
    """