fix #46
This commit is contained in:
parent
7817b677c1
commit
847ca06070
@ -74,3 +74,4 @@ It forks from [WordQuery](https://github.com/finalion/WordQuery), added **multi-
|
||||
- [pystardict](https://github.com/lig/pystardict)
|
||||
- [WordQuery](https://github.com/finalion/WordQuery)
|
||||
- [AnkiHub](https://github.com/dayjaby/AnkiHub)
|
||||
- [snowball_py](https://github.com/shibukawa/snowball_py)
|
||||
|
||||
@ -17,9 +17,12 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
from anki.hooks import addHook
|
||||
from anki.utils import isMac
|
||||
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
############## other config here ##################
|
||||
shortcut = ('Ctrl+Alt' if isMac else 'Ctrl') + '+Q'
|
||||
###################################################
|
||||
|
||||
27
addons/fastwq/libs/snowballstemmer/__init__.py
Normal file
27
addons/fastwq/libs/snowballstemmer/__init__.py
Normal file
@ -0,0 +1,27 @@
|
||||
__all__ = ('language', 'stemmer')
|
||||
|
||||
from .english_stemmer import EnglishStemmer
|
||||
|
||||
language = {
|
||||
'english': EnglishStemmer,
|
||||
}
|
||||
|
||||
try:
|
||||
import Stemmer
|
||||
cext_available = True
|
||||
except ImportError:
|
||||
cext_available = False
|
||||
|
||||
def algorithms():
    """Return the names of the stemming algorithms that can be requested.

    When the optional ``Stemmer`` C extension was imported successfully its
    own list is returned; otherwise the keys of the bundled pure-Python
    ``language`` table are returned as a list.
    """
    if cext_available:
        # PyStemmer publishes its list as ``Stemmer.algorithms()``; the
        # module has no ``language`` attribute, so the previous call
        # ``Stemmer.language()`` raised AttributeError on this branch.
        return Stemmer.algorithms()
    return list(language.keys())
|
||||
|
||||
def stemmer(lang):
    """Create a stemmer object for the algorithm called ``lang``.

    The ``Stemmer`` C extension is used whenever it could be imported.
    Otherwise ``lang`` is looked up case-insensitively in the bundled
    ``language`` table and the matching class is instantiated.

    Raises:
        KeyError: no bundled stemmer is registered under ``lang`` (only
            raised on the pure-Python path).
    """
    if cext_available:
        return Stemmer.Stemmer(lang)

    key = lang.lower()
    if key not in language:
        raise KeyError("Stemming algorithm '%s' not found" % lang)
    return language[key]()
|
||||
15
addons/fastwq/libs/snowballstemmer/among.py
Normal file
15
addons/fastwq/libs/snowballstemmer/among.py
Normal file
@ -0,0 +1,15 @@
|
||||
|
||||
class Among(object):
    """One row of a search table consulted by the stemmer's ``find_among``.

    Attributes (all set in ``__init__``):
        s            the search string
        s_size       ``len(s)``, stored so it is not recomputed per lookup
        substring_i  index of the longest matching substring entry
        result       value reported when this entry is selected
        method       name of a method to call when the string matches, or
                     ``None`` when no extra check is needed
    """

    def __init__(self, s, substring_i, result, method=None):
        self.s = s
        self.s_size = len(s)
        self.substring_i = substring_i
        self.result = result
        self.method = method
|
||||
351
addons/fastwq/libs/snowballstemmer/basestemmer.py
Normal file
351
addons/fastwq/libs/snowballstemmer/basestemmer.py
Normal file
@ -0,0 +1,351 @@
|
||||
class BaseStemmer(object):
    '''
    Runtime support shared by the generated Snowball stemmers.

    An instance holds the string being processed (``current``) and the
    positions the stemming code works with: ``cursor``, the forward and
    backward bounds ``limit`` / ``limit_backward`` and the slice markers
    ``bra`` / ``ket``.  Stemmed words are memoised in ``_cache``.

    ``_stem_word`` calls ``self._stem()``, which is not defined in this
    class and has to be supplied by the concrete stemmer.
    '''

    def __init__(self):
        self.set_current("")
        self.maxCacheSize = 10000
        # word -> [stemmed word, value of _counter at last use]
        self._cache = {}
        self._counter = 0

    def set_current(self, value):
        '''
        Load ``value`` as the working string and reset every position.
        '''
        self.current = value
        self.cursor = 0
        self.limit = len(value)
        self.limit_backward = 0
        self.bra = 0
        self.ket = self.limit

    def get_current(self):
        '''
        Return the working string.
        '''
        return self.current

    def copy_from(self, other):
        '''
        Copy the working string and all positions from ``other``.
        '''
        for name in ("current", "cursor", "limit",
                     "limit_backward", "bra", "ket"):
            setattr(self, name, getattr(other, name))

    # --- character-class tests -------------------------------------------
    # ``s`` is a sequence of ints used as a bitmap: for a character code
    # ``ch`` in [min, max], bit ``(ch - min) & 7`` of ``s[(ch - min) >> 3]``
    # tells whether the character belongs to the grouping.

    def in_grouping(self, s, min, max):
        '''
        Step forward over the character at the cursor if it is in the
        grouping; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        if not (min <= code <= max):
            return False
        bit = code - min
        if not (s[bit >> 3] & (0x1 << (bit & 0x7))):
            return False
        self.cursor += 1
        return True

    def in_grouping_b(self, s, min, max):
        '''
        Backward variant of ``in_grouping``: tests the character before the
        cursor and steps back over it on success.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        if not (min <= code <= max):
            return False
        bit = code - min
        if not (s[bit >> 3] & (0x1 << (bit & 0x7))):
            return False
        self.cursor -= 1
        return True

    def out_grouping(self, s, min, max):
        '''
        Step forward over the character at the cursor if it is NOT in the
        grouping; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        outside = code > max or code < min
        if not outside:
            bit = code - min
            outside = (s[bit >> 3] & (0x1 << (bit & 0x7))) == 0
        if outside:
            self.cursor += 1
        return outside

    def out_grouping_b(self, s, min, max):
        '''
        Backward variant of ``out_grouping``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        outside = code > max or code < min
        if not outside:
            bit = code - min
            outside = (s[bit >> 3] & (0x1 << (bit & 0x7))) == 0
        if outside:
            self.cursor -= 1
        return outside

    def in_range(self, min, max):
        '''
        Step forward if the character code at the cursor lies in
        [min, max]; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        if not (min <= ord(self.current[self.cursor]) <= max):
            return False
        self.cursor += 1
        return True

    def in_range_b(self, min, max):
        '''
        Backward variant of ``in_range``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if not (min <= ord(self.current[self.cursor - 1]) <= max):
            return False
        self.cursor -= 1
        return True

    def out_range(self, min, max):
        '''
        Step forward if the character code at the cursor lies outside
        [min, max]; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        if min <= ord(self.current[self.cursor]) <= max:
            return False
        self.cursor += 1
        return True

    def out_range_b(self, min, max):
        '''
        Backward variant of ``out_range``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if min <= ord(self.current[self.cursor - 1]) <= max:
            return False
        self.cursor -= 1
        return True

    # --- literal string tests --------------------------------------------

    def eq_s(self, s_size, s):
        '''
        If the ``s_size`` characters starting at the cursor equal ``s``,
        move the cursor past them and return True; otherwise return False.
        '''
        end = self.cursor + s_size
        if end > self.limit or self.current[self.cursor:end] != s:
            return False
        self.cursor = end
        return True

    def eq_s_b(self, s_size, s):
        '''
        If the ``s_size`` characters ending at the cursor equal ``s``,
        move the cursor back before them and return True.
        '''
        start = self.cursor - s_size
        if start < self.limit_backward or self.current[start:self.cursor] != s:
            return False
        self.cursor = start
        return True

    def eq_v(self, s):
        '''
        ``eq_s`` with the length taken from ``s`` itself.
        '''
        return self.eq_s(len(s), s)

    def eq_v_b(self, s):
        '''
        ``eq_s_b`` with the length taken from ``s`` itself.
        '''
        return self.eq_s_b(len(s), s)

    # --- table lookup ----------------------------------------------------

    def find_among(self, v, v_size):
        '''
        Look up the text at the cursor in the table ``v``.

        The first loop is a binary search over ``v[0:v_size]`` driven by
        the sign of the first differing character; the second loop starts
        at the entry the search settled on and follows ``substring_i``
        links until an entry is fully matched (and its optional ``method``
        returns a true value).  On success the cursor is placed just after
        the matched string and the entry's ``result`` is returned; when the
        chain ends (``substring_i < 0``) 0 is returned.
        '''
        lo = 0
        hi = v_size

        start = self.cursor
        end = self.limit

        matched_lo = 0
        matched_hi = 0

        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            # the smaller of the two prefix lengths already matched at the
            # bounds is known to match at ``mid`` as well
            common = min(matched_lo, matched_hi)
            entry = v[mid]
            while common < entry.s_size:
                if start + common == end:
                    # ran out of input: treat the input as smaller
                    diff = -1
                    break
                diff = ord(self.current[start + common]) - ord(entry.s[common])
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                matched_hi = common
            else:
                lo = mid
                matched_lo = common
            if hi - lo <= 1:
                # Stop when v[lo] has been inspected (lo > 0), when the
                # table has a single item (hi == lo), or after the one
                # extra round needed to get v[0] inspected.
                if lo > 0 or hi == lo or first_key_inspected:
                    break
                first_key_inspected = True

        while True:
            entry = v[lo]
            if matched_lo >= entry.s_size:
                self.cursor = start + entry.s_size
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                # the callback may have moved the cursor; put it back
                self.cursor = start + entry.s_size
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    def find_among_b(self, v, v_size):
        '''
        find_among_b is for backwards processing: strings are compared
        from their last character towards the first, against the text
        ending at the cursor, and on success the cursor is placed just
        before the matched string.  Otherwise the same comments as for
        ``find_among`` apply.
        '''
        lo = 0
        hi = v_size

        start = self.cursor
        lb = self.limit_backward

        matched_lo = 0
        matched_hi = 0

        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            common = min(matched_lo, matched_hi)
            entry = v[mid]
            while common < entry.s_size:
                if start - common == lb:
                    diff = -1
                    break
                diff = (ord(self.current[start - 1 - common])
                        - ord(entry.s[entry.s_size - 1 - common]))
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                matched_hi = common
            else:
                lo = mid
                matched_lo = common
            if hi - lo <= 1:
                if lo > 0 or hi == lo or first_key_inspected:
                    break
                first_key_inspected = True

        while True:
            entry = v[lo]
            if matched_lo >= entry.s_size:
                self.cursor = start - entry.s_size
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                self.cursor = start - entry.s_size
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    # --- editing the working string --------------------------------------

    def replace_s(self, c_bra, c_ket, s):
        '''
        Replace the characters between ``c_bra`` and ``c_ket`` in
        self.current by the characters in ``s``.

        ``limit`` is shifted by the change in length.  A cursor at or after
        ``c_ket`` is shifted too; a cursor strictly inside the replaced
        span is moved to ``c_bra``.

        @type c_bra int
        @type c_ket int
        @type s: string
        @return the change in length (may be negative)
        '''
        adjustment = len(s) - (c_ket - c_bra)
        self.current = self.current[0:c_bra] + s + self.current[c_ket:]
        self.limit += adjustment
        if self.cursor >= c_ket:
            self.cursor += adjustment
        elif self.cursor > c_bra:
            self.cursor = c_bra
        return adjustment

    def slice_check(self):
        '''
        Return True when ``0 <= bra <= ket <= limit <= len(current)``.
        '''
        return 0 <= self.bra <= self.ket <= self.limit <= len(self.current)

    def slice_from(self, s):
        '''
        Replace the slice ``current[bra:ket]`` with ``s``.

        @type s string
        @return False (and no change) when ``slice_check`` fails, else True
        '''
        if not self.slice_check():
            return False
        self.replace_s(self.bra, self.ket, s)
        return True

    def slice_del(self):
        '''
        Delete the slice ``current[bra:ket]``; same return as slice_from.
        '''
        return self.slice_from("")

    def insert(self, c_bra, c_ket, s):
        '''
        Replace ``current[c_bra:c_ket]`` with ``s`` and shift ``bra`` /
        ``ket`` by the change in length when ``c_bra`` is not after them.

        @type c_bra int
        @type c_ket int
        @type s: string
        '''
        adjustment = self.replace_s(c_bra, c_ket, s)
        if c_bra <= self.bra:
            self.bra += adjustment
        if c_bra <= self.ket:
            self.ket += adjustment

    def slice_to(self, s):
        '''
        Return the slice ``current[bra:ket]``, or '' when ``slice_check``
        fails.  The argument is not used by this implementation.

        @type s: string
        '''
        if self.slice_check():
            return self.current[self.bra:self.ket]
        return ''

    def assign_to(self, s):
        '''
        Return ``current`` up to ``limit``.  The argument is not used by
        this implementation.

        @type s: string
        '''
        return self.current[0:self.limit]

    # --- public API and cache --------------------------------------------

    def _stem_word(self, word):
        '''
        Return the stem of ``word``, using and refreshing the cache.
        '''
        entry = self._cache.get(word)
        if entry is not None:
            # cache hit: record the use so the entry counts as recent
            entry[1] = self._counter
            stemmed = entry[0]
        else:
            self.set_current(word)
            self._stem()
            stemmed = self.get_current()
            self._cache[word] = [stemmed, self._counter]
        self._counter += 1
        return stemmed

    def _clear_cache(self):
        '''
        Drop the least recently used entries so that roughly 80% of
        ``maxCacheSize`` entries remain.
        '''
        surplus = int(len(self._cache) - self.maxCacheSize * 8 / 10)
        by_last_use = sorted(self._cache.items(), key=lambda item: item[1][1])
        for word, _ in by_last_use[0:surplus]:
            del self._cache[word]

    def stemWord(self, word):
        '''
        Return the stem of ``word``; trims the cache afterwards when it has
        grown beyond ``maxCacheSize``.
        '''
        stemmed = self._stem_word(word)
        if len(self._cache) > self.maxCacheSize:
            self._clear_cache()
        return stemmed

    def stemWords(self, words):
        '''
        Return the list of stems for ``words``; the cache is trimmed once,
        after all words have been processed.
        '''
        stems = [self._stem_word(word) for word in words]
        if len(self._cache) > self.maxCacheSize:
            self._clear_cache()
        return stems
|
||||
1115
addons/fastwq/libs/snowballstemmer/english_stemmer.py
Normal file
1115
addons/fastwq/libs/snowballstemmer/english_stemmer.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -33,6 +33,7 @@ from ..context import config
|
||||
from ..service import service_pool, QueryResult, copy_static_file
|
||||
from ..service.base import LocalService
|
||||
from ..utils import wrap_css
|
||||
from ..libs.snowballstemmer import stemmer
|
||||
|
||||
|
||||
__all__ = [
|
||||
@ -263,14 +264,27 @@ def query_flds(note, fileds=None):
|
||||
|
||||
def cloze_deletion(text, term):
    '''create cloze deletion text

    Every word of ``text`` whose stem equals the stem of ``term`` (compared
    case-insensitively, using the module-level ``_stemmer``) is wrapped as
    ``{{c1::word}}``.  Words lying entirely inside an HTML tag (``<...>``)
    are left untouched so that tag names and attribute values are never
    clozed.  Typographic apostrophes are normalised to ``'`` first, and the
    returned text carries that normalisation.

    Returns the text unchanged (apart from the apostrophe normalisation)
    when ``term`` stems to an empty string.
    '''
    text = text.replace('’', '\'')
    term = _stemmer.stemWord(term).lower()
    if not term:
        # nothing meaningful to look for
        return text

    # Collect the tag spans up front.  A bare ``re.finditer`` iterator can
    # only be walked once, so checking it inside the word loop would stop
    # protecting tags after the first word had consumed it.
    tag_spans = [tag.span() for tag in re.finditer(r"<[^>]+>", text)]

    pieces = []
    copied_to = 0  # end of the part of ``text`` already emitted
    for m in re.finditer(r"\b[\w'-]*\b", text):
        s, e = m.span()
        if s == e:
            # the pattern also yields empty matches at word boundaries
            continue
        if any(s >= tag_s and e <= tag_e for tag_s, tag_e in tag_spans):
            continue
        word = text[s:e]
        if _stemmer.stemWord(word).lower() == term:
            pieces.append(text[copied_to:s])
            pieces.append("{{c1::" + word + "}}")
            copied_to = e
    pieces.append(text[copied_to:])
    return "".join(pieces)
|
||||
|
||||
_stemmer = stemmer('english')
|
||||
|
||||
@ -17,9 +17,12 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
from anki.hooks import addHook
|
||||
from anki.utils import isMac
|
||||
|
||||
sys.dont_write_bytecode = True
|
||||
|
||||
############## other config here ##################
|
||||
shortcut = ('Ctrl+Alt' if isMac else 'Ctrl') + '+Q'
|
||||
###################################################
|
||||
|
||||
27
addons21/fastwq/libs/snowballstemmer/__init__.py
Normal file
27
addons21/fastwq/libs/snowballstemmer/__init__.py
Normal file
@ -0,0 +1,27 @@
|
||||
__all__ = ('language', 'stemmer')
|
||||
|
||||
from .english_stemmer import EnglishStemmer
|
||||
|
||||
language = {
|
||||
'english': EnglishStemmer,
|
||||
}
|
||||
|
||||
try:
|
||||
import Stemmer
|
||||
cext_available = True
|
||||
except ImportError:
|
||||
cext_available = False
|
||||
|
||||
def algorithms():
    """Return the names of the stemming algorithms that can be requested.

    When the optional ``Stemmer`` C extension was imported successfully its
    own list is returned; otherwise the keys of the bundled pure-Python
    ``language`` table are returned as a list.
    """
    if cext_available:
        # PyStemmer publishes its list as ``Stemmer.algorithms()``; the
        # module has no ``language`` attribute, so the previous call
        # ``Stemmer.language()`` raised AttributeError on this branch.
        return Stemmer.algorithms()
    return list(language.keys())
|
||||
|
||||
def stemmer(lang):
    """Create a stemmer object for the algorithm called ``lang``.

    The ``Stemmer`` C extension is used whenever it could be imported.
    Otherwise ``lang`` is looked up case-insensitively in the bundled
    ``language`` table and the matching class is instantiated.

    Raises:
        KeyError: no bundled stemmer is registered under ``lang`` (only
            raised on the pure-Python path).
    """
    if cext_available:
        return Stemmer.Stemmer(lang)

    key = lang.lower()
    if key not in language:
        raise KeyError("Stemming algorithm '%s' not found" % lang)
    return language[key]()
|
||||
15
addons21/fastwq/libs/snowballstemmer/among.py
Normal file
15
addons21/fastwq/libs/snowballstemmer/among.py
Normal file
@ -0,0 +1,15 @@
|
||||
|
||||
class Among(object):
    """One row of a search table consulted by the stemmer's ``find_among``.

    Attributes (all set in ``__init__``):
        s            the search string
        s_size       ``len(s)``, stored so it is not recomputed per lookup
        substring_i  index of the longest matching substring entry
        result       value reported when this entry is selected
        method       name of a method to call when the string matches, or
                     ``None`` when no extra check is needed
    """

    def __init__(self, s, substring_i, result, method=None):
        self.s = s
        self.s_size = len(s)
        self.substring_i = substring_i
        self.result = result
        self.method = method
|
||||
351
addons21/fastwq/libs/snowballstemmer/basestemmer.py
Normal file
351
addons21/fastwq/libs/snowballstemmer/basestemmer.py
Normal file
@ -0,0 +1,351 @@
|
||||
class BaseStemmer(object):
    '''
    Runtime support shared by the generated Snowball stemmers.

    An instance holds the string being processed (``current``) and the
    positions the stemming code works with: ``cursor``, the forward and
    backward bounds ``limit`` / ``limit_backward`` and the slice markers
    ``bra`` / ``ket``.  Stemmed words are memoised in ``_cache``.

    ``_stem_word`` calls ``self._stem()``, which is not defined in this
    class and has to be supplied by the concrete stemmer.
    '''

    def __init__(self):
        self.set_current("")
        self.maxCacheSize = 10000
        # word -> [stemmed word, value of _counter at last use]
        self._cache = {}
        self._counter = 0

    def set_current(self, value):
        '''
        Load ``value`` as the working string and reset every position.
        '''
        self.current = value
        self.cursor = 0
        self.limit = len(value)
        self.limit_backward = 0
        self.bra = 0
        self.ket = self.limit

    def get_current(self):
        '''
        Return the working string.
        '''
        return self.current

    def copy_from(self, other):
        '''
        Copy the working string and all positions from ``other``.
        '''
        for name in ("current", "cursor", "limit",
                     "limit_backward", "bra", "ket"):
            setattr(self, name, getattr(other, name))

    # --- character-class tests -------------------------------------------
    # ``s`` is a sequence of ints used as a bitmap: for a character code
    # ``ch`` in [min, max], bit ``(ch - min) & 7`` of ``s[(ch - min) >> 3]``
    # tells whether the character belongs to the grouping.

    def in_grouping(self, s, min, max):
        '''
        Step forward over the character at the cursor if it is in the
        grouping; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        if not (min <= code <= max):
            return False
        bit = code - min
        if not (s[bit >> 3] & (0x1 << (bit & 0x7))):
            return False
        self.cursor += 1
        return True

    def in_grouping_b(self, s, min, max):
        '''
        Backward variant of ``in_grouping``: tests the character before the
        cursor and steps back over it on success.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        if not (min <= code <= max):
            return False
        bit = code - min
        if not (s[bit >> 3] & (0x1 << (bit & 0x7))):
            return False
        self.cursor -= 1
        return True

    def out_grouping(self, s, min, max):
        '''
        Step forward over the character at the cursor if it is NOT in the
        grouping; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        code = ord(self.current[self.cursor])
        outside = code > max or code < min
        if not outside:
            bit = code - min
            outside = (s[bit >> 3] & (0x1 << (bit & 0x7))) == 0
        if outside:
            self.cursor += 1
        return outside

    def out_grouping_b(self, s, min, max):
        '''
        Backward variant of ``out_grouping``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        code = ord(self.current[self.cursor - 1])
        outside = code > max or code < min
        if not outside:
            bit = code - min
            outside = (s[bit >> 3] & (0x1 << (bit & 0x7))) == 0
        if outside:
            self.cursor -= 1
        return outside

    def in_range(self, min, max):
        '''
        Step forward if the character code at the cursor lies in
        [min, max]; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        if not (min <= ord(self.current[self.cursor]) <= max):
            return False
        self.cursor += 1
        return True

    def in_range_b(self, min, max):
        '''
        Backward variant of ``in_range``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if not (min <= ord(self.current[self.cursor - 1]) <= max):
            return False
        self.cursor -= 1
        return True

    def out_range(self, min, max):
        '''
        Step forward if the character code at the cursor lies outside
        [min, max]; return whether the step was made.
        '''
        if self.cursor >= self.limit:
            return False
        if min <= ord(self.current[self.cursor]) <= max:
            return False
        self.cursor += 1
        return True

    def out_range_b(self, min, max):
        '''
        Backward variant of ``out_range``.
        '''
        if self.cursor <= self.limit_backward:
            return False
        if min <= ord(self.current[self.cursor - 1]) <= max:
            return False
        self.cursor -= 1
        return True

    # --- literal string tests --------------------------------------------

    def eq_s(self, s_size, s):
        '''
        If the ``s_size`` characters starting at the cursor equal ``s``,
        move the cursor past them and return True; otherwise return False.
        '''
        end = self.cursor + s_size
        if end > self.limit or self.current[self.cursor:end] != s:
            return False
        self.cursor = end
        return True

    def eq_s_b(self, s_size, s):
        '''
        If the ``s_size`` characters ending at the cursor equal ``s``,
        move the cursor back before them and return True.
        '''
        start = self.cursor - s_size
        if start < self.limit_backward or self.current[start:self.cursor] != s:
            return False
        self.cursor = start
        return True

    def eq_v(self, s):
        '''
        ``eq_s`` with the length taken from ``s`` itself.
        '''
        return self.eq_s(len(s), s)

    def eq_v_b(self, s):
        '''
        ``eq_s_b`` with the length taken from ``s`` itself.
        '''
        return self.eq_s_b(len(s), s)

    # --- table lookup ----------------------------------------------------

    def find_among(self, v, v_size):
        '''
        Look up the text at the cursor in the table ``v``.

        The first loop is a binary search over ``v[0:v_size]`` driven by
        the sign of the first differing character; the second loop starts
        at the entry the search settled on and follows ``substring_i``
        links until an entry is fully matched (and its optional ``method``
        returns a true value).  On success the cursor is placed just after
        the matched string and the entry's ``result`` is returned; when the
        chain ends (``substring_i < 0``) 0 is returned.
        '''
        lo = 0
        hi = v_size

        start = self.cursor
        end = self.limit

        matched_lo = 0
        matched_hi = 0

        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            # the smaller of the two prefix lengths already matched at the
            # bounds is known to match at ``mid`` as well
            common = min(matched_lo, matched_hi)
            entry = v[mid]
            while common < entry.s_size:
                if start + common == end:
                    # ran out of input: treat the input as smaller
                    diff = -1
                    break
                diff = ord(self.current[start + common]) - ord(entry.s[common])
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                matched_hi = common
            else:
                lo = mid
                matched_lo = common
            if hi - lo <= 1:
                # Stop when v[lo] has been inspected (lo > 0), when the
                # table has a single item (hi == lo), or after the one
                # extra round needed to get v[0] inspected.
                if lo > 0 or hi == lo or first_key_inspected:
                    break
                first_key_inspected = True

        while True:
            entry = v[lo]
            if matched_lo >= entry.s_size:
                self.cursor = start + entry.s_size
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                # the callback may have moved the cursor; put it back
                self.cursor = start + entry.s_size
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    def find_among_b(self, v, v_size):
        '''
        find_among_b is for backwards processing: strings are compared
        from their last character towards the first, against the text
        ending at the cursor, and on success the cursor is placed just
        before the matched string.  Otherwise the same comments as for
        ``find_among`` apply.
        '''
        lo = 0
        hi = v_size

        start = self.cursor
        lb = self.limit_backward

        matched_lo = 0
        matched_hi = 0

        first_key_inspected = False

        while True:
            mid = lo + ((hi - lo) >> 1)
            diff = 0
            common = min(matched_lo, matched_hi)
            entry = v[mid]
            while common < entry.s_size:
                if start - common == lb:
                    diff = -1
                    break
                diff = (ord(self.current[start - 1 - common])
                        - ord(entry.s[entry.s_size - 1 - common]))
                if diff != 0:
                    break
                common += 1
            if diff < 0:
                hi = mid
                matched_hi = common
            else:
                lo = mid
                matched_lo = common
            if hi - lo <= 1:
                if lo > 0 or hi == lo or first_key_inspected:
                    break
                first_key_inspected = True

        while True:
            entry = v[lo]
            if matched_lo >= entry.s_size:
                self.cursor = start - entry.s_size
                if entry.method is None:
                    return entry.result
                accepted = getattr(self, entry.method)()
                self.cursor = start - entry.s_size
                if accepted:
                    return entry.result
            lo = entry.substring_i
            if lo < 0:
                return 0

    # --- editing the working string --------------------------------------

    def replace_s(self, c_bra, c_ket, s):
        '''
        Replace the characters between ``c_bra`` and ``c_ket`` in
        self.current by the characters in ``s``.

        ``limit`` is shifted by the change in length.  A cursor at or after
        ``c_ket`` is shifted too; a cursor strictly inside the replaced
        span is moved to ``c_bra``.

        @type c_bra int
        @type c_ket int
        @type s: string
        @return the change in length (may be negative)
        '''
        adjustment = len(s) - (c_ket - c_bra)
        self.current = self.current[0:c_bra] + s + self.current[c_ket:]
        self.limit += adjustment
        if self.cursor >= c_ket:
            self.cursor += adjustment
        elif self.cursor > c_bra:
            self.cursor = c_bra
        return adjustment

    def slice_check(self):
        '''
        Return True when ``0 <= bra <= ket <= limit <= len(current)``.
        '''
        return 0 <= self.bra <= self.ket <= self.limit <= len(self.current)

    def slice_from(self, s):
        '''
        Replace the slice ``current[bra:ket]`` with ``s``.

        @type s string
        @return False (and no change) when ``slice_check`` fails, else True
        '''
        if not self.slice_check():
            return False
        self.replace_s(self.bra, self.ket, s)
        return True

    def slice_del(self):
        '''
        Delete the slice ``current[bra:ket]``; same return as slice_from.
        '''
        return self.slice_from("")

    def insert(self, c_bra, c_ket, s):
        '''
        Replace ``current[c_bra:c_ket]`` with ``s`` and shift ``bra`` /
        ``ket`` by the change in length when ``c_bra`` is not after them.

        @type c_bra int
        @type c_ket int
        @type s: string
        '''
        adjustment = self.replace_s(c_bra, c_ket, s)
        if c_bra <= self.bra:
            self.bra += adjustment
        if c_bra <= self.ket:
            self.ket += adjustment

    def slice_to(self, s):
        '''
        Return the slice ``current[bra:ket]``, or '' when ``slice_check``
        fails.  The argument is not used by this implementation.

        @type s: string
        '''
        if self.slice_check():
            return self.current[self.bra:self.ket]
        return ''

    def assign_to(self, s):
        '''
        Return ``current`` up to ``limit``.  The argument is not used by
        this implementation.

        @type s: string
        '''
        return self.current[0:self.limit]

    # --- public API and cache --------------------------------------------

    def _stem_word(self, word):
        '''
        Return the stem of ``word``, using and refreshing the cache.
        '''
        entry = self._cache.get(word)
        if entry is not None:
            # cache hit: record the use so the entry counts as recent
            entry[1] = self._counter
            stemmed = entry[0]
        else:
            self.set_current(word)
            self._stem()
            stemmed = self.get_current()
            self._cache[word] = [stemmed, self._counter]
        self._counter += 1
        return stemmed

    def _clear_cache(self):
        '''
        Drop the least recently used entries so that roughly 80% of
        ``maxCacheSize`` entries remain.
        '''
        surplus = int(len(self._cache) - self.maxCacheSize * 8 / 10)
        by_last_use = sorted(self._cache.items(), key=lambda item: item[1][1])
        for word, _ in by_last_use[0:surplus]:
            del self._cache[word]

    def stemWord(self, word):
        '''
        Return the stem of ``word``; trims the cache afterwards when it has
        grown beyond ``maxCacheSize``.
        '''
        stemmed = self._stem_word(word)
        if len(self._cache) > self.maxCacheSize:
            self._clear_cache()
        return stemmed

    def stemWords(self, words):
        '''
        Return the list of stems for ``words``; the cache is trimmed once,
        after all words have been processed.
        '''
        stems = [self._stem_word(word) for word in words]
        if len(self._cache) > self.maxCacheSize:
            self._clear_cache()
        return stems
|
||||
1115
addons21/fastwq/libs/snowballstemmer/english_stemmer.py
Normal file
1115
addons21/fastwq/libs/snowballstemmer/english_stemmer.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -33,6 +33,7 @@ from ..context import config
|
||||
from ..service import service_pool, QueryResult, copy_static_file
|
||||
from ..service.base import LocalService
|
||||
from ..utils import wrap_css
|
||||
from ..libs.snowballstemmer import stemmer
|
||||
|
||||
|
||||
__all__ = [
|
||||
@ -264,14 +265,27 @@ def query_flds(note, fileds=None):
|
||||
|
||||
def cloze_deletion(text, term):
    '''create cloze deletion text

    Every word of ``text`` whose stem equals the stem of ``term`` (compared
    case-insensitively, using the module-level ``_stemmer``) is wrapped as
    ``{{c1::word}}``.  Words lying entirely inside an HTML tag (``<...>``)
    are left untouched so that tag names and attribute values are never
    clozed.  Typographic apostrophes are normalised to ``'`` first, and the
    returned text carries that normalisation.

    Returns the text unchanged (apart from the apostrophe normalisation)
    when ``term`` stems to an empty string.
    '''
    text = text.replace('’', '\'')
    term = _stemmer.stemWord(term).lower()
    if not term:
        # nothing meaningful to look for
        return text

    # Collect the tag spans up front.  A bare ``re.finditer`` iterator can
    # only be walked once, so checking it inside the word loop would stop
    # protecting tags after the first word had consumed it.
    tag_spans = [tag.span() for tag in re.finditer(r"<[^>]+>", text)]

    pieces = []
    copied_to = 0  # end of the part of ``text`` already emitted
    for m in re.finditer(r"\b[\w'-]*\b", text):
        s, e = m.span()
        if s == e:
            # the pattern also yields empty matches at word boundaries
            continue
        if any(s >= tag_s and e <= tag_e for tag_s, tag_e in tag_spans):
            continue
        word = text[s:e]
        if _stemmer.stemWord(word).lower() == term:
            pieces.append(text[copied_to:s])
            pieces.append("{{c1::" + word + "}}")
            copied_to = e
    pieces.append(text[copied_to:])
    return "".join(pieces)
|
||||
|
||||
_stemmer = stemmer('english')
|
||||
|
||||
Loading…
Reference in New Issue
Block a user