From e84f3f13a6b4b932c9824663fcc69f179ae03323 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 19:48:51 +0800 Subject: [PATCH 01/10] Fix bug that definition is incomplete when query word like 'ambivalent' from cambridge. --- addons21/fastwq/service/dict/cambridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 41c07a0..238b46f 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -56,7 +56,7 @@ class Cambridge(WebService): pg = element.find('span', class_='posgram ico-bg') # 义 - senses = element.find_all('div', class_='sense-block') + senses = element.find_all('div', class_='pos-body') if senses: for sense in senses: dbs = sense.find_all('div', class_='def-block pad-indent') From 2994cf778adf2a106a0ef4d37e25fd8a60a71882 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 20:43:02 +0800 Subject: [PATCH 02/10] Fix pos is wrong when query word like 'ambivalent' from cambridge. --- addons21/fastwq/service/dict/cambridge.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 238b46f..8243b3b 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -1,4 +1,4 @@ -#-*- coding:utf-8 -*- +# -*- coding:utf-8 -*- import os import re @@ -52,14 +52,24 @@ class Cambridge(WebService): if snd: result['pronunciation'][pn+'mp3'] = cambridge_url_base + snd.get('data-src-mp3') header_found = True - # 词性 - pg = element.find('span', class_='posgram ico-bg') # 义 - senses = element.find_all('div', class_='pos-body') + senses = element.find_all('div', id=re.compile("english-chinese-simplified*")) + # 词性 + pos = element.find('span', class_='pos') + gram = element.find('span', class_='gram') + pos_gram = (pos.get_text() if pos else '') + (gram.get_text() if gram else '') + if senses: for sense in senses: + # 像ambivalent之类词语含有ambivalence解释,词性不同 + pos_2 = sense.find('span', class_='pos') + gram_2 = sense.find('span', class_='gram') + if pos_2 is not None: + pos_gram = (pos_2.get_text() if pos_2 else '') + (gram_2.get_text() if gram else '') + dbs = sense.find_all('div', class_='def-block pad-indent') + if dbs: l = result['def_list'] for db in dbs: @@ -69,7 +79,7 @@ class Cambridge(WebService): examps = db.find_all('div', class_='examp emphasized') l.append( u'
  • {0}{1}{2} {3}{4}
  • '.format( - '{0}'.format(pg.get_text() if pg else ''), + '{0}'.format(pos_gram), u'{0}'.format(i.get_text()) if i else u'', u'{0}'.format(d.get_text()) if d else u'', From efd1c84a326a6f8407da12313a9cd838d4228aa2 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 21:02:25 +0800 Subject: [PATCH 03/10] Add phrase name field to word sense of Cambridge. --- addons21/fastwq/service/dict/cambridge.py | 40 ++++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 8243b3b..42be6a8 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -2,6 +2,8 @@ import os import re +from bs4 import Tag + from ..base import * cambridge_url_base = u'https://dictionary.cambridge.org/' @@ -68,21 +70,35 @@ class Cambridge(WebService): if pos_2 is not None: pos_gram = (pos_2.get_text() if pos_2 else '') + (gram_2.get_text() if gram else '') - dbs = sense.find_all('div', class_='def-block pad-indent') + sense_body = sense.find('div', class_='sense-body') - if dbs: + if sense_body: l = result['def_list'] - for db in dbs: - i = sense.find('span', class_='def-info') - d = db.find('b', class_='def') - tran = db.find('span', class_='trans') - examps = db.find_all('div', class_='examp emphasized') - l.append( - u'
  • {0}{1}{2} {3}{4}
  • '.format( - '{0}'.format(pos_gram), + phrase_name = None + for block in sense_body: + if isinstance(block, Tag) is not True: + continue - u'{0}'.format(i.get_text()) if i else u'', - u'{0}'.format(d.get_text()) if d else u'', + block_type = block['class'][0] + if block_type == 'def-block': + pass + elif block_type == 'phrase-block': + phrase_title = block.find('span', class_='phrase-title') + phrase_name = phrase_title.get_text() if phrase_title else None + pass + else: + continue + + i = block.find('span', class_='def-info') + d = block.find('b', class_='def') + tran = block.find('span', class_='trans') + examps = block.find_all('div', class_='examp emphasized') + l.append( + u'
  • {0}{1}{2}{3} {4}{5}
  • '.format( + '{0}'.format(pos_gram), + '{0}'.format(phrase_name) if phrase_name else '', + '{0}'.format(i.get_text()) if i else '', + '{0}'.format(d.get_text()) if d else u'', u'{0}'.format(tran.get_text()) if tran else u'', u''.join( From e3e76e22dbca89caae89734686093a4c23c605b3 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 21:16:54 +0800 Subject: [PATCH 04/10] Fix fail to query word from cambridge ee. --- addons21/fastwq/service/dict/cambridge.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 42be6a8..927c199 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -56,7 +56,12 @@ class Cambridge(WebService): header_found = True # 义 - senses = element.find_all('div', id=re.compile("english-chinese-simplified*")) + if 'english-chinese-simplified' in self._get_url(): + senses = element.find_all('div', id=re.compile("english-chinese-simplified*")) + elif 'english-chinese-traditional' in self._get_url(): + senses = element.find_all('div', id=re.compile("english-chinese-traditional*")) + else: + senses = element.find_all('div', id=re.compile("cald4*")) # 词性 pos = element.find('span', class_='pos') gram = element.find('span', class_='gram') @@ -68,9 +73,9 @@ class Cambridge(WebService): pos_2 = sense.find('span', class_='pos') gram_2 = sense.find('span', class_='gram') if pos_2 is not None: - pos_gram = (pos_2.get_text() if pos_2 else '') + (gram_2.get_text() if gram else '') + pos_gram = (pos_2.get_text() if pos_2 else '') + (gram_2.get_text() if gram_2 else '') - sense_body = sense.find('div', class_='sense-body') + sense_body = sense.find('div', class_=re.compile("sense-body|runon-body pad-indent")) if sense_body: l = result['def_list'] @@ -86,6 +91,8 @@ class Cambridge(WebService): phrase_title = block.find('span', class_='phrase-title') phrase_name = phrase_title.get_text() if phrase_title else None pass + elif block_type == 'runon-body': + pass else: continue @@ -100,7 +107,7 @@ class Cambridge(WebService): '{0}'.format(i.get_text()) if i else '', '{0}'.format(d.get_text()) if d else u'', - u'{0}'.format(tran.get_text()) if tran else u'', + '{0}'.format(tran.get_text()) if tran else '', u''.join( u'
    {0}
    '.format(e.get_text()) if e else u'' for e in examps From bb106a0d61397154f78c988f1e912668c75b42d9 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 21:37:18 +0800 Subject: [PATCH 05/10] Bugs fix; Remove pos_gram span if pos_gram is blank. --- addons21/fastwq/service/dict/cambridge.py | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 927c199..0029ddc 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -63,23 +63,23 @@ class Cambridge(WebService): else: senses = element.find_all('div', id=re.compile("cald4*")) # 词性 - pos = element.find('span', class_='pos') - gram = element.find('span', class_='gram') - pos_gram = (pos.get_text() if pos else '') + (gram.get_text() if gram else '') + span_posgram = element.find('span', class_='posgram ico-bg') + pos_gram = (span_posgram.get_text() if span_posgram else '') if senses: for sense in senses: # 像ambivalent之类词语含有ambivalence解释,词性不同 - pos_2 = sense.find('span', class_='pos') - gram_2 = sense.find('span', class_='gram') - if pos_2 is not None: - pos_gram = (pos_2.get_text() if pos_2 else '') + (gram_2.get_text() if gram_2 else '') + if sense['class'][0] == 'runon': + runon_pos = sense.find('span', class_='pos') + runon_gram = sense.find('span', class_='gram') + if runon_pos is not None: + pos_gram = (runon_pos.get_text() if runon_pos else '') + (runon_gram.get_text() if runon_gram else '') sense_body = sense.find('div', class_=re.compile("sense-body|runon-body pad-indent")) if sense_body: l = result['def_list'] - phrase_name = None + phrase = None for block in sense_body: if isinstance(block, Tag) is not True: continue @@ -88,8 +88,8 @@ class Cambridge(WebService): if block_type == 'def-block': pass elif block_type == 'phrase-block': - phrase_title = block.find('span', class_='phrase-title') - phrase_name = phrase_title.get_text() if phrase_title else None + phrase_header = block.find('span', class_='phrase-head') + phrase = phrase_header.get_text() if phrase_header else None pass elif block_type == 'runon-body': pass @@ -102,8 +102,8 @@ class Cambridge(WebService): examps = block.find_all('div', class_='examp emphasized') l.append( u'
  • {0}{1}{2}{3} {4}{5}
  • '.format( - '{0}'.format(pos_gram), - '{0}'.format(phrase_name) if phrase_name else '', + '{0}'.format(pos_gram) if pos_gram != "" else '', + '{0}'.format(phrase) if phrase else '', '{0}'.format(i.get_text()) if i else '', '{0}'.format(d.get_text()) if d else u'', From 9340b42f2c25f64e8ce540c83b90fc73c7adddc9 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 21:45:25 +0800 Subject: [PATCH 06/10] =?UTF-8?q?Remove=20unnecessary=20symbol=20'?= =?UTF-8?q?=E2=80=BA'=20from=20def-info(cambridge).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- addons21/fastwq/service/dict/cambridge.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 0029ddc..fcd0af2 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -96,15 +96,16 @@ class Cambridge(WebService): else: continue - i = block.find('span', class_='def-info') + span_df = block.find('span', class_='def-info') + def_info = (span_df.get_text().replace('›', '') if span_df else '') d = block.find('b', class_='def') tran = block.find('span', class_='trans') examps = block.find_all('div', class_='examp emphasized') l.append( u'
  • {0}{1}{2}{3} {4}{5}
  • '.format( - '{0}'.format(pos_gram) if pos_gram != "" else '', + '{0}'.format(pos_gram) if pos_gram != '' else '', '{0}'.format(phrase) if phrase else '', - '{0}'.format(i.get_text()) if i else '', + '{0}'.format(def_info) if def_info.strip() != '' else '', '{0}'.format(d.get_text()) if d else u'', '{0}'.format(tran.get_text()) if tran else '', From bb9e2360e868bb09be36c6643c25f2282bff165d Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 21:56:20 +0800 Subject: [PATCH 07/10] Add runon_title field to word sense if exist(Cambridge). --- addons21/fastwq/service/dict/cambridge.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index fcd0af2..54a2505 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -69,11 +69,14 @@ class Cambridge(WebService): if senses: for sense in senses: # 像ambivalent之类词语含有ambivalence解释,词性不同 + runon_title = None if sense['class'][0] == 'runon': runon_pos = sense.find('span', class_='pos') runon_gram = sense.find('span', class_='gram') if runon_pos is not None: - pos_gram = (runon_pos.get_text() if runon_pos else '') + (runon_gram.get_text() if runon_gram else '') + pos_gram = runon_pos.get_text() + (runon_gram.get_text() if runon_gram else '') + h3_rt = sense.find('h3', class_='runon-title') + runon_title = (h3_rt.get_text() if h3_rt else None) sense_body = sense.find('div', class_=re.compile("sense-body|runon-body pad-indent")) @@ -102,8 +105,9 @@ class Cambridge(WebService): tran = block.find('span', class_='trans') examps = block.find_all('div', class_='examp emphasized') l.append( - u'
  • {0}{1}{2}{3} {4}{5}
  • '.format( + u'
  • {0}{1}{2}{3}{4} {5}{6}
  • '.format( '{0}'.format(pos_gram) if pos_gram != '' else '', + '{0}'.format(runon_title) if runon_title else '', '{0}'.format(phrase) if phrase else '', '{0}'.format(def_info) if def_info.strip() != '' else '', '{0}'.format(d.get_text()) if d else u'', From 5533c5d783175ca673ce21506b8bf40f843a2dad Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 22:05:26 +0800 Subject: [PATCH 08/10] Fix bug that add duplicate phrase field to def-block when query word like 'charge'(Cambridge). --- addons21/fastwq/service/dict/cambridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 54a2505..53fd409 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -82,11 +82,11 @@ class Cambridge(WebService): if sense_body: l = result['def_list'] - phrase = None for block in sense_body: if isinstance(block, Tag) is not True: continue + phrase = None block_type = block['class'][0] if block_type == 'def-block': pass From dbcf26eaa713cca36438c2e797ba8ac2a7214ea7 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 22:07:40 +0800 Subject: [PATCH 09/10] Fix fail to query word 'proficiency' from Cambridge ee. --- addons21/fastwq/service/dict/cambridge.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 53fd409..e587f86 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -62,6 +62,9 @@ class Cambridge(WebService): senses = element.find_all('div', id=re.compile("english-chinese-traditional*")) else: senses = element.find_all('div', id=re.compile("cald4*")) + # proficiency之类的词语 + if not senses: + senses = element.find_all('div', id=re.compile("cbed*")) # 词性 span_posgram = element.find('span', class_='posgram ico-bg') pos_gram = (span_posgram.get_text() if span_posgram else '') From e7d422e1ed2574e73d9031e66290f9194a6e2d74 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Fri, 12 Apr 2019 22:10:42 +0800 Subject: [PATCH 10/10] Fix fail to query word 'shoplift' from Cambridge ee. --- addons21/fastwq/service/dict/cambridge.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index e587f86..523839a 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -65,6 +65,9 @@ class Cambridge(WebService): # proficiency之类的词语 if not senses: senses = element.find_all('div', id=re.compile("cbed*")) + # shoplift之类的词语 + if not senses: + senses = element.find_all('div', id=re.compile("cacd*")) # 词性 span_posgram = element.find('span', class_='posgram ico-bg') pos_gram = (span_posgram.get_text() if span_posgram else '')