From cc8b082e75bc69dca1c33502fe28f28d6aa847cc Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Sun, 14 Apr 2019 13:05:02 +0800 Subject: [PATCH 1/3] Fix incomplete senses when a phrase has multiple senses in cambridge (test word: 'beg'). --- addons21/fastwq/service/dict/cambridge.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 523839a..4733f23 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -88,22 +88,25 @@ class Cambridge(WebService): if sense_body: l = result['def_list'] - for block in sense_body: - if isinstance(block, Tag) is not True: - continue - phrase = None + def extract_sense(block, phrase=None): + if isinstance(block, Tag) is not True: + return + block_type = block['class'][0] if block_type == 'def-block': pass elif block_type == 'phrase-block': - phrase_header = block.find('span', class_='phrase-head') - phrase = phrase_header.get_text() if phrase_header else None - pass + _phrase_header = block.find('span', class_='phrase-head') + _phrase_body = block.find('div', class_='phrase-body pad-indent') + if _phrase_body: + for p_b in _phrase_body: + extract_sense(p_b, _phrase_header.get_text() if _phrase_header else None) + return elif block_type == 'runon-body': pass else: - continue + return span_df = block.find('span', class_='def-info') def_info = (span_df.get_text().replace('›', '') if span_df else '') @@ -125,6 +128,9 @@ class Cambridge(WebService): ) ) ) + + for b in sense_body: + extract_sense(b) result['def'] = u'' img = sense.find('img', class_='lightboxLink') if img: From 525774625c3ad83d0f14f52bb58a03e83927ee5a Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Sun, 14 Apr 2019 13:07:48 +0800 Subject: [PATCH 2/3] Fix incomplete result for the word 'allocation' from cambridge ee. 
--- addons21/fastwq/service/dict/cambridge.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index 4733f23..ba4a4db 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -31,7 +31,7 @@ class Cambridge(WebService): } # english - element = soup.find('div', class_='di-body') + element = soup.find('div', class_='page') if element: # 页 elements = element.find_all('div', class_='entry-body__el clrd js-share-holder') @@ -56,18 +56,11 @@ class Cambridge(WebService): header_found = True # 义 - if 'english-chinese-simplified' in self._get_url(): - senses = element.find_all('div', id=re.compile("english-chinese-simplified*")) - elif 'english-chinese-traditional' in self._get_url(): - senses = element.find_all('div', id=re.compile("english-chinese-traditional*")) - else: - senses = element.find_all('div', id=re.compile("cald4*")) - # proficiency之类的词语 - if not senses: - senses = element.find_all('div', id=re.compile("cbed*")) - # shoplift之类的词语 - if not senses: - senses = element.find_all('div', id=re.compile("cacd*")) + senses = element.find_all('div', id=re.compile("english-chinese-simplified*|" + "english-chinese-traditional*|" + "cald4*|" + "cbed*|" + "cacd*")) # 词性 span_posgram = element.find('span', class_='posgram ico-bg') pos_gram = (span_posgram.get_text() if span_posgram else '') From c59c936db03db5644796ddd289a32cf3f0d66010 Mon Sep 17 00:00:00 2001 From: Javan Zhu Date: Sun, 14 Apr 2019 14:33:33 +0800 Subject: [PATCH 3/3] Fix: look up the root container by class 'cdo-dblclick-area' instead of 'page' --- addons21/fastwq/service/dict/cambridge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addons21/fastwq/service/dict/cambridge.py b/addons21/fastwq/service/dict/cambridge.py index ba4a4db..f1d0b94 100644 --- a/addons21/fastwq/service/dict/cambridge.py +++ b/addons21/fastwq/service/dict/cambridge.py @@ -31,7 +31,7 @@ class Cambridge(WebService): } # 
english - element = soup.find('div', class_='page') + element = soup.find('div', class_='cdo-dblclick-area') if element: # 页 elements = element.find_all('div', class_='entry-body__el clrd js-share-holder')