mirror of https://github.com/zlatinb/muwire
modify indexing and search logic to account for phrases
parent
7e881f1fe6
commit
8dcba7535c
|
@ -31,25 +31,48 @@ class SearchIndex {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String[] split(String source) {
|
private static String[] split(final String source) {
|
||||||
source = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase()
|
// first split by split pattern
|
||||||
String [] split = source.split(" ")
|
String sourceSplit = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase()
|
||||||
|
String [] split = sourceSplit.split(" ")
|
||||||
def rv = []
|
def rv = []
|
||||||
split.each { if (it.length() > 0) rv << it }
|
split.each { if (it.length() > 0) rv << it }
|
||||||
|
|
||||||
|
// then just by ' '
|
||||||
|
source.split(' ').each { if (it.length() > 0) rv << it }
|
||||||
|
|
||||||
|
// and add original string
|
||||||
|
rv << source
|
||||||
rv.toArray(new String[0])
|
rv.toArray(new String[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
String[] search(List<String> terms) {
|
String[] search(List<String> terms) {
|
||||||
Set<String> rv = null;
|
Set<String> rv = null;
|
||||||
|
|
||||||
|
Set<String> powerSet = new HashSet<>()
|
||||||
terms.each {
|
terms.each {
|
||||||
|
powerSet.addAll(it.toLowerCase().split(' '))
|
||||||
|
}
|
||||||
|
|
||||||
|
powerSet.each {
|
||||||
Set<String> forWord = keywords.getOrDefault(it,[])
|
Set<String> forWord = keywords.getOrDefault(it,[])
|
||||||
if (rv == null) {
|
if (rv == null) {
|
||||||
rv = new HashSet<>(forWord)
|
rv = new HashSet<>(forWord)
|
||||||
} else {
|
} else {
|
||||||
rv.retainAll(forWord)
|
rv.retainAll(forWord)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now, filter by terms
|
||||||
|
for (Iterator<String> iter = rv.iterator(); iter.hasNext();) {
|
||||||
|
String candidate = iter.next()
|
||||||
|
candidate = candidate.toLowerCase()
|
||||||
|
boolean keep = true
|
||||||
|
terms.each {
|
||||||
|
keep &= candidate.contains(it)
|
||||||
|
}
|
||||||
|
if (!keep)
|
||||||
|
iter.remove()
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rv != null)
|
if (rv != null)
|
||||||
|
|
|
@ -90,4 +90,22 @@ class SearchIndexTest {
|
||||||
def found = index.search(["muwire", "0", "3", "jar"])
|
def found = index.search(["muwire", "0", "3", "jar"])
|
||||||
assert found.size() == 1
|
assert found.size() == 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testOriginalText() {
|
||||||
|
initIndex(["a-b c-d"])
|
||||||
|
def found = index.search(['a-b'])
|
||||||
|
assert found.size() == 1
|
||||||
|
found = index.search(['c-d'])
|
||||||
|
assert found.size() == 1
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testPhrase() {
|
||||||
|
initIndex(["a-b c-d e-f"])
|
||||||
|
def found = index.search(['a-b c-d'])
|
||||||
|
assert found.size() == 1
|
||||||
|
assert index.search(['c-d e-f']).size() == 1
|
||||||
|
assert index.search(['a-b e-f']).size() == 0
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue