From 8dcba7535c7a75517d29a9a7e47d7066eeec3149 Mon Sep 17 00:00:00 2001 From: Zlatin Balevsky Date: Tue, 5 Nov 2019 13:24:22 +0000 Subject: [PATCH] modify indexing and search logic to account for phrases --- .../com/muwire/core/search/SearchIndex.groovy | 31 ++++++++++++++++--- .../muwire/core/search/SearchIndexTest.groovy | 18 +++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/core/src/main/groovy/com/muwire/core/search/SearchIndex.groovy b/core/src/main/groovy/com/muwire/core/search/SearchIndex.groovy index 86b5b5fd..3e87a762 100644 --- a/core/src/main/groovy/com/muwire/core/search/SearchIndex.groovy +++ b/core/src/main/groovy/com/muwire/core/search/SearchIndex.groovy @@ -31,25 +31,48 @@ class SearchIndex { } } - private static String[] split(String source) { - source = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase() - String [] split = source.split(" ") + private static String[] split(final String source) { + // first split by split pattern + String sourceSplit = source.replaceAll(SplitPattern.SPLIT_PATTERN, " ").toLowerCase() + String [] split = sourceSplit.split(" ") def rv = [] split.each { if (it.length() > 0) rv << it } + + // then just by ' ' + source.split(' ').each { if (it.length() > 0) rv << it } + + // and add original string + rv << source rv.toArray(new String[0]) } String[] search(List terms) { Set rv = null; + Set powerSet = new HashSet<>() terms.each { + powerSet.addAll(it.toLowerCase().split(' ')) + } + + powerSet.each { Set forWord = keywords.getOrDefault(it,[]) if (rv == null) { rv = new HashSet<>(forWord) } else { rv.retainAll(forWord) } - + } + + // now, filter by terms + for (Iterator iter = rv.iterator(); iter.hasNext();) { + String candidate = iter.next() + candidate = candidate.toLowerCase() + boolean keep = true + terms.each { + keep &= candidate.contains(it) + } + if (!keep) + iter.remove() } if (rv != null) diff --git a/core/src/test/groovy/com/muwire/core/search/SearchIndexTest.groovy b/core/src/test/groovy/com/muwire/core/search/SearchIndexTest.groovy index 03264808..6cae8645 100644 --- a/core/src/test/groovy/com/muwire/core/search/SearchIndexTest.groovy +++ b/core/src/test/groovy/com/muwire/core/search/SearchIndexTest.groovy @@ -90,4 +90,22 @@ class SearchIndexTest { def found = index.search(["muwire", "0", "3", "jar"]) assert found.size() == 1 } + + @Test + void testOriginalText() { + initIndex(["a-b c-d"]) + def found = index.search(['a-b']) + assert found.size() == 1 + found = index.search(['c-d']) + assert found.size() == 1 + } + + @Test + void testPhrase() { + initIndex(["a-b c-d e-f"]) + def found = index.search(['a-b c-d']) + assert found.size() == 1 + assert index.search(['c-d e-f']).size() == 1 + assert index.search(['a-b e-f']).size() == 0 + } }