This commit is contained in:
St.Huang 2018-07-27 14:51:18 +08:00
parent 634da45544
commit 7804aaf307
2 changed files with 129 additions and 128 deletions

View File

@ -1,165 +1,161 @@
# -*- coding: utf-8 -*- #-*- coding:utf-8 -*-
# Copyright: khuang6 <upday7@163.com>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
""" from hashlib import sha1
Project : wq from ..base import WebService, export, register, with_styles, parse_html
Created: 12/20/2017 from ...libs.bs4 import Tag
"""
import os
from warnings import filterwarnings
from ...libs.bs4 import BeautifulSoup, Tag
from ..base import WebService, export, register, with_styles
filterwarnings('ignore')
import sys
reload(sys)
sys.setdefaultencoding('utf8')
@register(u'朗文') longman_download_mp3 = True
@register([u'朗文', u'Longman'])
class Longman(WebService): class Longman(WebService):
def __init__(self): def __init__(self):
super(Longman, self).__init__() super(Longman, self).__init__()
def _get_singledict(self, single_dict): def _get_field(self, key, default=u''):
""" return self.cache_result(key) if self.cached(key) else self._get_content().get(key, default)
:type word: str def _get_content(self):
:return: url = 'https://www.ldoceonline.com/dictionary/{}'.format(self.word)
""" data = self.get_response(url)
soup = parse_html(data)
# Top Container
dictlinks = soup.find_all('span', {'class': 'dictlink'})
body_html = ""
word_info = {}
head_finded = False
for dic_link in dictlinks:
assert isinstance(dic_link, Tag)
if not (self.cached(single_dict) and self.cache_result(single_dict)): # remove sound tag
rsp = rq.get("https://www.ldoceonline.com/dictionary/{}".format(self.word), headers={ am_s_tag = dic_link.find('span', title='Play American pronunciation of {}'.format(self.word))
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36' br_s_tag = dic_link.find('span', title='Play British pronunciation of {}'.format(self.word))
}) if am_s_tag:
word_info['am_mp3'] = am_s_tag.get('data-src-mp3', u'')
am_s_tag.decompose()
if br_s_tag:
word_info['br_mp3'] = br_s_tag.get('data-src-mp3', u'')
br_s_tag.decompose()
if rsp.status_code == 200: # Remove related Topics Container
bs = BeautifulSoup(rsp.content.decode('utf-8'), 'html.parser', from_encoding="utf-8") related_topic_tag = dic_link.find('div', {'class': "topics_container"})
# Top Container if related_topic_tag:
dictlinks = bs.find_all('span', {'class': 'dictlink'}) related_topic_tag.decompose()
body_html = ""
word_info = { # Remove Tail
} tail_tag = dic_link.find("span", {'class': 'Tail'})
ee_ = '' if tail_tag:
for dic_link in dictlinks: tail_tag.decompose()
assert isinstance(dic_link, Tag)
# Remove related Topics Container # Remove SubEntry
related_topic_tag = dic_link.find('div', {'class': "topics_container"}) sub_entries = dic_link.find_all('span', {'class': 'SubEntry'})
if related_topic_tag: for sub_entry in sub_entries:
related_topic_tag.decompose() sub_entry.decompose()
# Remove Tail # word elements
tail_tag = dic_link.find("span", {'class': 'Tail'}) head_tag = dic_link.find('span', {'class': "Head"})
if tail_tag: if head_tag and not head_finded:
tail_tag.decompose() try:
hyphenation = head_tag.find("span", {'class': 'HYPHENATION'}).string # Hyphenation
except:
hyphenation = u''
try:
pron_codes = u''.join(
list(head_tag.find("span", {'class': 'PronCodes'}).strings)) # Hyphenation
except:
pron_codes = u''
try:
POS = head_tag.find("span", {'class': 'POS'}).string # Hyphenation
except:
POS = u''
# Remove SubEntry try:
sub_entries = dic_link.find_all('span', {'class': 'SubEntry'}) Inflections = head_tag.find('span', {'class': 'Inflections'})
for sub_entry in sub_entries: if Inflections:
sub_entry.decompose() Inflections = str(Inflections)
else:
Inflections = u''
except:
Inflections = u''
# word elements word_info['phonetic'] = pron_codes
head_tag = dic_link.find('span', {'class': "Head"}) word_info['hyphenation'] = hyphenation
if head_tag and not word_info: word_info['pos'] = POS
try: word_info['inflections'] = Inflections
hyphenation = head_tag.find("span", {'class': 'HYPHENATION'}).string # Hyphenation head_finded = True
except: #self.cache_this(word_info)
hyphenation = '' if head_tag:
try: head_tag.decompose()
pron_codes = "".join(
list(head_tag.find("span", {'class': 'PronCodes'}).strings)) # Hyphenation
except:
pron_codes = ''
try:
POS = head_tag.find("span", {'class': 'POS'}).string # Hyphenation
except:
POS = ''
try: # remove script tag
Inflections = head_tag.find('span', {'class': 'Inflections'}) script_tags = dic_link.find_all('script')
if Inflections: for t in script_tags:
Inflections = str(Inflections) t.decompose()
else:
Inflections = ''
except:
Inflections = ''
word_info = { # remove img tag
'phonetic': pron_codes, img_tags = dic_link.find_all('img')
'hyphenation': hyphenation, for t in img_tags:
'pos': POS, t.decompose()
'inflections': Inflections,
}
self.cache_this(word_info)
if head_tag:
head_tag.decompose()
# remove script tag # remove example sound tag
script_tags = dic_link.find_all('script') emp_s_tags = dic_link.find_all('span', {'class': 'speaker exafile fa fa-volume-up'})
for t in script_tags: for t in emp_s_tags:
t.decompose() t.decompose()
# remove img tag body_html += str(dic_link)
img_tags = dic_link.find_all('img')
for t in img_tags:
self.cache_this({'img': 'https://www.ldoceonline.com' + t['src']})
t.decompose()
# remove sound tag word_info['ee'] = body_html
am_s_tag = dic_link.find("span", title='Play American pronunciation of {}'.format(self.word)) return self.cache_this(word_info)
br_s_tag = dic_link.find("span", title='Play British pronunciation of {}'.format(self.word))
if am_s_tag:
am_s_tag.decompose()
if br_s_tag:
br_s_tag.decompose()
# remove example sound tag
emp_s_tags = dic_link.find_all('span', {'class': 'speaker exafile fa fa-volume-up'})
for t in emp_s_tags:
t.decompose()
body_html += str(dic_link)
ee_ = body_html
self.cache_this({
'ee': ee_
})
else:
return ''
return self.cache_result(single_dict)
@export(u'音标') @export(u'音标')
def fld_phonetic(self): def fld_phonetic(self):
return self._get_singledict('phonetic') return self._get_field('phonetic')
def _fld_mp3(self, fld):
audio_url = self._get_field(fld)
if longman_download_mp3 and audio_url:
filename = u'_longman_{}_.mp3'.format(self.word)
hex_digest = sha1(
self.word.encode('utf-8') if isinstance(self.word, unicode)
else self.word
).hexdigest().lower()
assert len(hex_digest) == 40, "unexpected output from hash library"
filename = '.'.join([
'-'.join([
self.unique.lower(
), hex_digest[:8], hex_digest[8:16],
hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
]),
'mp3',
])
if self.net_download(filename, audio_url):
return self.get_anki_label(filename, 'audio')
return ''
@export(u'美音')
def fld_mp3_us(self):
return self._fld_mp3('am_mp3')
@export(u'英音')
def fld_mp3_uk(self):
return self._fld_mp3('br_mp3')
@export(u'断字单词') @export(u'断字单词')
def fld_hyphenation(self): def fld_hyphenation(self):
return self._get_singledict('hyphenation') return self._get_field('hyphenation')
@export(u'词性') @export(u'词性')
def fld_pos(self): def fld_pos(self):
return self._get_singledict('pos') return self._get_field('pos')
@export(u'英英解释') @export(u'英英解释')
@with_styles(cssfile='_longman.css') @with_styles(cssfile='_longman.css')
def fld_ee(self): def fld_ee(self):
return self._get_singledict('ee') return self._get_field('ee')
@export(u'图片')
def fld_pic(self):
url = self._get_singledict('img')
filename = u'longman_img_{}'.format(os.path.basename(url))
if url and self.download(url, filename):
return self.get_anki_label(filename, 'img')
return ''
@export(u'变形') @export(u'变形')
@with_styles(cssfile='_longman.css') @with_styles(cssfile='_longman.css')
def fld_inflections(self): def fld_inflections(self):
return self._get_singledict('inflections') return self._get_field('inflections')

View File

@ -17,6 +17,7 @@
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys
import inspect import inspect
import os import os
from hashlib import md5 from hashlib import md5
@ -26,6 +27,10 @@ from ..context import config
from ..utils import importlib from ..utils import importlib
reload(sys)
sys.setdefaultencoding('utf8')
class ServiceManager(object): class ServiceManager(object):
""" """
Query service class manager Query service class manager