fix #24
This commit is contained in:
parent
634da45544
commit
7804aaf307
@ -1,165 +1,161 @@
|
|||||||
# -*- coding: utf-8 -*-
|
#-*- coding:utf-8 -*-
|
||||||
# Copyright: khuang6 <upday7@163.com>
|
|
||||||
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
|
||||||
|
|
||||||
"""
|
from hashlib import sha1
|
||||||
Project : wq
|
from ..base import WebService, export, register, with_styles, parse_html
|
||||||
Created: 12/20/2017
|
from ...libs.bs4 import Tag
|
||||||
"""
|
|
||||||
import os
|
|
||||||
from warnings import filterwarnings
|
|
||||||
from ...libs.bs4 import BeautifulSoup, Tag
|
|
||||||
|
|
||||||
from ..base import WebService, export, register, with_styles
|
|
||||||
|
|
||||||
filterwarnings('ignore')
|
|
||||||
import sys
|
|
||||||
|
|
||||||
reload(sys)
|
|
||||||
sys.setdefaultencoding('utf8')
|
|
||||||
|
|
||||||
|
|
||||||
@register(u'朗文')
|
longman_download_mp3 = True
|
||||||
|
|
||||||
|
|
||||||
|
@register([u'朗文', u'Longman'])
|
||||||
class Longman(WebService):
|
class Longman(WebService):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Longman, self).__init__()
|
super(Longman, self).__init__()
|
||||||
|
|
||||||
def _get_singledict(self, single_dict):
|
def _get_field(self, key, default=u''):
|
||||||
"""
|
return self.cache_result(key) if self.cached(key) else self._get_content().get(key, default)
|
||||||
|
|
||||||
:type word: str
|
def _get_content(self):
|
||||||
:return:
|
url = 'https://www.ldoceonline.com/dictionary/{}'.format(self.word)
|
||||||
"""
|
data = self.get_response(url)
|
||||||
|
soup = parse_html(data)
|
||||||
|
# Top Container
|
||||||
|
dictlinks = soup.find_all('span', {'class': 'dictlink'})
|
||||||
|
body_html = ""
|
||||||
|
word_info = {}
|
||||||
|
head_finded = False
|
||||||
|
for dic_link in dictlinks:
|
||||||
|
assert isinstance(dic_link, Tag)
|
||||||
|
|
||||||
if not (self.cached(single_dict) and self.cache_result(single_dict)):
|
# remove sound tag
|
||||||
rsp = rq.get("https://www.ldoceonline.com/dictionary/{}".format(self.word), headers={
|
am_s_tag = dic_link.find('span', title='Play American pronunciation of {}'.format(self.word))
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36'
|
br_s_tag = dic_link.find('span', title='Play British pronunciation of {}'.format(self.word))
|
||||||
})
|
if am_s_tag:
|
||||||
|
word_info['am_mp3'] = am_s_tag.get('data-src-mp3', u'')
|
||||||
|
am_s_tag.decompose()
|
||||||
|
if br_s_tag:
|
||||||
|
word_info['br_mp3'] = br_s_tag.get('data-src-mp3', u'')
|
||||||
|
br_s_tag.decompose()
|
||||||
|
|
||||||
if rsp.status_code == 200:
|
# Remove related Topics Container
|
||||||
bs = BeautifulSoup(rsp.content.decode('utf-8'), 'html.parser', from_encoding="utf-8")
|
related_topic_tag = dic_link.find('div', {'class': "topics_container"})
|
||||||
# Top Container
|
if related_topic_tag:
|
||||||
dictlinks = bs.find_all('span', {'class': 'dictlink'})
|
related_topic_tag.decompose()
|
||||||
body_html = ""
|
|
||||||
|
|
||||||
word_info = {
|
# Remove Tail
|
||||||
}
|
tail_tag = dic_link.find("span", {'class': 'Tail'})
|
||||||
ee_ = ''
|
if tail_tag:
|
||||||
for dic_link in dictlinks:
|
tail_tag.decompose()
|
||||||
assert isinstance(dic_link, Tag)
|
|
||||||
|
|
||||||
# Remove related Topics Container
|
# Remove SubEntry
|
||||||
related_topic_tag = dic_link.find('div', {'class': "topics_container"})
|
sub_entries = dic_link.find_all('span', {'class': 'SubEntry'})
|
||||||
if related_topic_tag:
|
for sub_entry in sub_entries:
|
||||||
related_topic_tag.decompose()
|
sub_entry.decompose()
|
||||||
|
|
||||||
# Remove Tail
|
# word elements
|
||||||
tail_tag = dic_link.find("span", {'class': 'Tail'})
|
head_tag = dic_link.find('span', {'class': "Head"})
|
||||||
if tail_tag:
|
if head_tag and not head_finded:
|
||||||
tail_tag.decompose()
|
try:
|
||||||
|
hyphenation = head_tag.find("span", {'class': 'HYPHENATION'}).string # Hyphenation
|
||||||
|
except:
|
||||||
|
hyphenation = u''
|
||||||
|
try:
|
||||||
|
pron_codes = u''.join(
|
||||||
|
list(head_tag.find("span", {'class': 'PronCodes'}).strings)) # Hyphenation
|
||||||
|
except:
|
||||||
|
pron_codes = u''
|
||||||
|
try:
|
||||||
|
POS = head_tag.find("span", {'class': 'POS'}).string # Hyphenation
|
||||||
|
except:
|
||||||
|
POS = u''
|
||||||
|
|
||||||
# Remove SubEntry
|
try:
|
||||||
sub_entries = dic_link.find_all('span', {'class': 'SubEntry'})
|
Inflections = head_tag.find('span', {'class': 'Inflections'})
|
||||||
for sub_entry in sub_entries:
|
if Inflections:
|
||||||
sub_entry.decompose()
|
Inflections = str(Inflections)
|
||||||
|
else:
|
||||||
|
Inflections = u''
|
||||||
|
except:
|
||||||
|
Inflections = u''
|
||||||
|
|
||||||
# word elements
|
word_info['phonetic'] = pron_codes
|
||||||
head_tag = dic_link.find('span', {'class': "Head"})
|
word_info['hyphenation'] = hyphenation
|
||||||
if head_tag and not word_info:
|
word_info['pos'] = POS
|
||||||
try:
|
word_info['inflections'] = Inflections
|
||||||
hyphenation = head_tag.find("span", {'class': 'HYPHENATION'}).string # Hyphenation
|
head_finded = True
|
||||||
except:
|
#self.cache_this(word_info)
|
||||||
hyphenation = ''
|
if head_tag:
|
||||||
try:
|
head_tag.decompose()
|
||||||
pron_codes = "".join(
|
|
||||||
list(head_tag.find("span", {'class': 'PronCodes'}).strings)) # Hyphenation
|
|
||||||
except:
|
|
||||||
pron_codes = ''
|
|
||||||
try:
|
|
||||||
POS = head_tag.find("span", {'class': 'POS'}).string # Hyphenation
|
|
||||||
except:
|
|
||||||
POS = ''
|
|
||||||
|
|
||||||
try:
|
# remove script tag
|
||||||
Inflections = head_tag.find('span', {'class': 'Inflections'})
|
script_tags = dic_link.find_all('script')
|
||||||
if Inflections:
|
for t in script_tags:
|
||||||
Inflections = str(Inflections)
|
t.decompose()
|
||||||
else:
|
|
||||||
Inflections = ''
|
|
||||||
except:
|
|
||||||
Inflections = ''
|
|
||||||
|
|
||||||
word_info = {
|
# remove img tag
|
||||||
'phonetic': pron_codes,
|
img_tags = dic_link.find_all('img')
|
||||||
'hyphenation': hyphenation,
|
for t in img_tags:
|
||||||
'pos': POS,
|
t.decompose()
|
||||||
'inflections': Inflections,
|
|
||||||
}
|
|
||||||
self.cache_this(word_info)
|
|
||||||
if head_tag:
|
|
||||||
head_tag.decompose()
|
|
||||||
|
|
||||||
# remove script tag
|
# remove example sound tag
|
||||||
script_tags = dic_link.find_all('script')
|
emp_s_tags = dic_link.find_all('span', {'class': 'speaker exafile fa fa-volume-up'})
|
||||||
for t in script_tags:
|
for t in emp_s_tags:
|
||||||
t.decompose()
|
t.decompose()
|
||||||
|
|
||||||
# remove img tag
|
body_html += str(dic_link)
|
||||||
img_tags = dic_link.find_all('img')
|
|
||||||
for t in img_tags:
|
|
||||||
self.cache_this({'img': 'https://www.ldoceonline.com' + t['src']})
|
|
||||||
t.decompose()
|
|
||||||
|
|
||||||
# remove sound tag
|
word_info['ee'] = body_html
|
||||||
am_s_tag = dic_link.find("span", title='Play American pronunciation of {}'.format(self.word))
|
return self.cache_this(word_info)
|
||||||
br_s_tag = dic_link.find("span", title='Play British pronunciation of {}'.format(self.word))
|
|
||||||
if am_s_tag:
|
|
||||||
am_s_tag.decompose()
|
|
||||||
if br_s_tag:
|
|
||||||
br_s_tag.decompose()
|
|
||||||
|
|
||||||
# remove example sound tag
|
|
||||||
emp_s_tags = dic_link.find_all('span', {'class': 'speaker exafile fa fa-volume-up'})
|
|
||||||
for t in emp_s_tags:
|
|
||||||
t.decompose()
|
|
||||||
|
|
||||||
body_html += str(dic_link)
|
|
||||||
ee_ = body_html
|
|
||||||
self.cache_this({
|
|
||||||
'ee': ee_
|
|
||||||
})
|
|
||||||
|
|
||||||
else:
|
|
||||||
return ''
|
|
||||||
return self.cache_result(single_dict)
|
|
||||||
|
|
||||||
@export(u'音标')
|
@export(u'音标')
|
||||||
def fld_phonetic(self):
|
def fld_phonetic(self):
|
||||||
return self._get_singledict('phonetic')
|
return self._get_field('phonetic')
|
||||||
|
|
||||||
|
def _fld_mp3(self, fld):
|
||||||
|
audio_url = self._get_field(fld)
|
||||||
|
if longman_download_mp3 and audio_url:
|
||||||
|
filename = u'_longman_{}_.mp3'.format(self.word)
|
||||||
|
hex_digest = sha1(
|
||||||
|
self.word.encode('utf-8') if isinstance(self.word, unicode)
|
||||||
|
else self.word
|
||||||
|
).hexdigest().lower()
|
||||||
|
assert len(hex_digest) == 40, "unexpected output from hash library"
|
||||||
|
filename = '.'.join([
|
||||||
|
'-'.join([
|
||||||
|
self.unique.lower(
|
||||||
|
), hex_digest[:8], hex_digest[8:16],
|
||||||
|
hex_digest[16:24], hex_digest[24:32], hex_digest[32:],
|
||||||
|
]),
|
||||||
|
'mp3',
|
||||||
|
])
|
||||||
|
if self.net_download(filename, audio_url):
|
||||||
|
return self.get_anki_label(filename, 'audio')
|
||||||
|
return ''
|
||||||
|
|
||||||
|
@export(u'美音')
|
||||||
|
def fld_mp3_us(self):
|
||||||
|
return self._fld_mp3('am_mp3')
|
||||||
|
|
||||||
|
@export(u'英音')
|
||||||
|
def fld_mp3_uk(self):
|
||||||
|
return self._fld_mp3('br_mp3')
|
||||||
|
|
||||||
@export(u'断字单词')
|
@export(u'断字单词')
|
||||||
def fld_hyphenation(self):
|
def fld_hyphenation(self):
|
||||||
return self._get_singledict('hyphenation')
|
return self._get_field('hyphenation')
|
||||||
|
|
||||||
@export(u'词性')
|
@export(u'词性')
|
||||||
def fld_pos(self):
|
def fld_pos(self):
|
||||||
return self._get_singledict('pos')
|
return self._get_field('pos')
|
||||||
|
|
||||||
@export(u'英英解释')
|
@export(u'英英解释')
|
||||||
@with_styles(cssfile='_longman.css')
|
@with_styles(cssfile='_longman.css')
|
||||||
def fld_ee(self):
|
def fld_ee(self):
|
||||||
return self._get_singledict('ee')
|
return self._get_field('ee')
|
||||||
|
|
||||||
@export(u'图片')
|
|
||||||
def fld_pic(self):
|
|
||||||
url = self._get_singledict('img')
|
|
||||||
filename = u'longman_img_{}'.format(os.path.basename(url))
|
|
||||||
if url and self.download(url, filename):
|
|
||||||
return self.get_anki_label(filename, 'img')
|
|
||||||
return ''
|
|
||||||
|
|
||||||
@export(u'变形')
|
@export(u'变形')
|
||||||
@with_styles(cssfile='_longman.css')
|
@with_styles(cssfile='_longman.css')
|
||||||
def fld_inflections(self):
|
def fld_inflections(self):
|
||||||
return self._get_singledict('inflections')
|
return self._get_field('inflections')
|
||||||
|
|||||||
@ -17,6 +17,7 @@
|
|||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import sys
|
||||||
import inspect
|
import inspect
|
||||||
import os
|
import os
|
||||||
from hashlib import md5
|
from hashlib import md5
|
||||||
@ -26,6 +27,10 @@ from ..context import config
|
|||||||
from ..utils import importlib
|
from ..utils import importlib
|
||||||
|
|
||||||
|
|
||||||
|
reload(sys)
|
||||||
|
sys.setdefaultencoding('utf8')
|
||||||
|
|
||||||
|
|
||||||
class ServiceManager(object):
|
class ServiceManager(object):
|
||||||
"""
|
"""
|
||||||
Query service class manager
|
Query service class manager
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user