In the first post of this series, I explained how to produce an inverted index using C#. In the second post, I explained how to improve the tokenization process, using the Porter Stemming Algorithm. Now, I will share how to process boolean queries.
Inverted Index?
Within a document collection, I assume that each document has a unique serial number (a documentId). During the index construction I simple assign successive integer to each new document.
The input to the indexing process is an enumeration of normalized tokens for each document. The output is a dictionary where the keys are the tokens, and the values are lists of the documentIds of the documents which contains that token.
using System.Collections.Generic;
using System.Linq;
namespace Tupi.Indexing
{
public class InvertedIndex
{
private readonly ISet _indexedDocuments =
new SortedSet();
public IEnumerable<int> IndexedDocuments => _indexedDocuments;
private readonly IDictionary<string, List<int>> _data =
new Dictionary<string, List>();
internal void Append(string term, int documentId)
{
if (_data.ContainsKey(term))
{
_data[term].Add(documentId);
}
else
{
var postings = new List {documentId};
_data.Add(term, postings);
}
_indexedDocuments.Add(documentId);
}
public IEnumerable<int> GetPostingsFor(string term) => !_data.ContainsKey(term)
? Enumerable.Empty()
: _data[term];
}
}
The Query Abstraction
Given an inverted index, I want to be able to perform a query that results in all documents that meet the specified criteria.
In my implementation, the Query class is a base class and a factory.
using System.Collections.Generic;
using Tupi.Indexing;
namespace Tupi.Querying.Queries
{
public abstract class Query
{
public abstract IEnumerable<int> Execute(InvertedIndex index);
public static Query Term(string term) =>
TermQuery.From(term);
public static Query All(params string[] terms) =>
AllQuery.From(terms);
public static Query All(IEnumerable<int> queries) =>
new AllQuery(queries);
public static Query And(Query left, Query right) =>
new AllQuery(left, right);
public static Query Any(params string[] terms) =>
AnyQuery.From(terms);
public static Query Or(Query left, Query right) =>
new AnyQuery(left, right);
public static Query Not(Query innerQuery)
=> new NotQuery(innerQuery);
}
}
Use this abstraction to define queries is simple.
var query1 = Query.Or(
Query.Term("csharp"),
Query.And(
Query.Term("Elemar"),
Query.Term("inverted")
)
);
var query2 = Query.Not(
Query.And(
Query.Term("Elemar"),
Query.Term("inverted")
)
);
Querying terms
Retrieving the list of documents containing a term is pretty simple. We just need to normalize the term and then check the dictionary.
using System;
using System.Collections.Generic;
using System.IO;
using Tupi.Indexing;
namespace Tupi.Querying.Queries
{
public class TermQuery : Query
{
private readonly string _term;
public TermQuery(string term)
{
_term = term;
}
public override IEnumerable<int> Execute(InvertedIndex index)
=> index.GetPostingsFor(_term);
public static Query From(string term) =>
From(term, DefaultAnalyzer.Instance);
public static Query From(string term, IAnalyzer analyzer)
{
using (var reader = new StringReader(term))
{
var tokenSource = new TokenSource(reader);
tokenSource.Next();
if (!analyzer.Process(tokenSource))
{
throw new InvalidOperationException($"Could not generate a term from: {term}");
}
return new TermQuery(tokenSource.ToString());
}
}
}
}
AND operator
Then AND implementation just intersects the results of two (or more) queries.
using System.Collections.Generic;
using System.Linq;
using Tupi.Indexing;
namespace Tupi.Querying.Queries
{
public class AllQuery : Query
{
private readonly IEnumerable<Query> _queries;
public AllQuery(IEnumerable<Query> queries)
{
_queries = queries;
}
public AllQuery(params Query[] queries)
{
_queries = queries;
}
public override IEnumerable<int> Execute(InvertedIndex index)
{
var postings = new List<List>();
foreach (var query in _queries)
{
var l = query.Execute(index).ToList();
if (!l.Any())
{
return Enumerable.Empty();
}
postings.Add(l.ToList());
}
postings.Sort((l1, l2) => l1.Count.CompareTo(l2.Count));
var results = postings[0];
for (var i = 1; i < postings.Count; i++) { results = results.Intersect(postings[1]).ToList(); } return results; } public static Query From(params string[] terms) =>
new AllQuery(terms.Select(TermQuery.From).ToList());
}
}
OR operator
The OR implementation just merges the results of two (or more) queries.
using System.Collections.Generic;
using System.Linq;
using Tupi.Indexing;
namespace Tupi.Querying.Queries
{
public class AnyQuery : Query
{
private readonly IEnumerable<Query> _queries;
public AnyQuery(IEnumerable<Query> queries)
{
_queries = queries;
}
public AnyQuery(params Query[] queries)
{
_queries = queries;
}
public override IEnumerable<int> Execute(InvertedIndex index)
{
var results = Enumerable.Empty();
return _queries.Aggregate(
seed: Enumerable.Empty(),
func: (current, query) => current.Union(query.Execute(index))
);
}
public static Query From(params string[] terms) =>
new AnyQuery(terms.Select(TermQuery.From).ToList());
}
}
NOT operator
The NOT implementation returns all indexed documents, except the returned by the execution of the inner query.
using System.Collections.Generic;
using System.Linq;
using Tupi.Indexing;
namespace Tupi.Querying.Queries
{
public class NotQuery : Query
{
private readonly Query _query;
public NotQuery(Query query)
{
_query = query;
}
public override IEnumerable<int> Execute(InvertedIndex index) =>
index.IndexedDocuments.Except(_query.Execute(index));
}
}
Last words
In this post a shared how to implement boolean queries on an inverted index. I am pretty sure that there are a lot of improvement opportunities here. Please give me your recommendations in the comments.

