davidbgk
/
larlet-fr-david-cache


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
							<!doctype html><!-- This is a valid HTML5 document. -->
<!-- Screen readers, SEO, extensions and so on. -->
<html lang="fr">
<!-- Has to be within the first 1024 bytes, hence before the `title` element
  See: https://www.w3.org/TR/2012/CR-html5-20121217/document-metadata.html#charset -->
<meta charset="utf-8">
<!-- Why no `X-UA-Compatible` meta: https://stackoverflow.com/a/6771584 -->
<!-- The viewport meta is quite crowded and we are responsible for that.
  See: https://codepen.io/tigt/post/meta-viewport-for-2015 -->
<meta name="viewport" content="width=device-width,initial-scale=1">
<!-- Required to make a valid HTML5 document. -->
<title>Building a full-text search engine in 150 lines of Python code (archive) — David Larlet</title>
<meta name="description" content="Publication mise en cache pour en conserver une trace.">
<!-- That good ol' feed, subscribe :). -->
<link rel="alternate" type="application/atom+xml" title="Feed" href="/david/log/">
<!-- Generated from https://realfavicongenerator.net/ such a mess. -->
<link rel="apple-touch-icon" sizes="180x180" href="/static/david/icons2/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/static/david/icons2/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/static/david/icons2/favicon-16x16.png">
<link rel="manifest" href="/static/david/icons2/site.webmanifest">
<link rel="mask-icon" href="/static/david/icons2/safari-pinned-tab.svg" color="#07486c">
<link rel="shortcut icon" href="/static/david/icons2/favicon.ico">
<meta name="msapplication-TileColor" content="#f7f7f7">
<meta name="msapplication-config" content="/static/david/icons2/browserconfig.xml">
<meta name="theme-color" content="#f7f7f7" media="(prefers-color-scheme: light)">
<meta name="theme-color" content="#272727" media="(prefers-color-scheme: dark)">
<!-- Documented, feel free to shoot an email. -->
<link rel="stylesheet" href="/static/david/css/style_2021-01-20.css">
<!-- See https://www.zachleat.com/web/comprehensive-webfonts/ for the trade-off. -->
<link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
<link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
<link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
<link rel="preload" href="/static/david/css/fonts/triplicate_t3_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
<link rel="preload" href="/static/david/css/fonts/triplicate_t3_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
<link rel="preload" href="/static/david/css/fonts/triplicate_t3_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
<script>
    /**
     * Force (or stop forcing) a colour theme on the document's root element.
     *
     * Exactly one of the marker classes `forced-dark` / `forced-light` is
     * switched on when `themeName` is 'dark' or 'light'; any other value
     * switches both off.
     *
     * @param {*} themeName - 'dark', 'light', or anything else for "no forced theme".
     */
    function toggleTheme(themeName) {
        const { classList } = document.documentElement
        const wantsDark = themeName === 'dark'
        const wantsLight = themeName === 'light'
        // The second argument makes toggle() an explicit add (true) / remove (false).
        classList.toggle('forced-dark', wantsDark)
        classList.toggle('forced-light', wantsLight)
    }
    // Theme saved under the 'theme' key, read as early as possible in the page.
    // Touching localStorage can throw (e.g. SecurityError when the browser is
    // configured to block storage); in that case fall back to null so the
    // binding is still initialised and the rest of the page keeps working.
    const selectedTheme = (function readStoredTheme() {
        try {
            // getItem() yields null when the key has never been written.
            return localStorage.getItem('theme')
        } catch (error) {
            // Deliberate best effort: no readable storage means no forced theme.
            return null
        }
    })()
    // The literal string 'undefined' is skipped on purpose.
    // NOTE(review): presumably that is what ends up stored when the theme is
    // reset elsewhere; confirm against the code that writes the 'theme' key.
    if (selectedTheme !== 'undefined') {
        toggleTheme(selectedTheme)
    }
</script>

  <meta name="robots" content="noindex, nofollow">
  <meta content="origin-when-cross-origin" name="referrer">
  <!-- Canonical URL for SEO purposes -->
  <link rel="canonical" href="https://bart.degoe.de/building-a-full-text-search-engine-150-lines-of-code/">

<body class="remarkdown h1-underline h2-underline h3-underline em-underscore hr-center ul-star pre-tick" data-instant-intensity="viewport-all">


<article>
  <header>
    <h1>Building a full-text search engine in 150 lines of Python code</h1>
  </header>
  <nav>
    <p class="center">
      <a href="/david/" title="Aller à l’accueil"><svg class="icon icon-home">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-home"></use>
      </svg> Accueil</a> •
      <a href="https://bart.degoe.de/building-a-full-text-search-engine-150-lines-of-code/" title="Lien vers le contenu original">Source originale</a>
    </p>
  </nav>
  <hr>
  <p>Full-text search is everywhere. From finding a book on Scribd, a movie on Netflix, toilet paper on Amazon, or anything else on the web through Google (like <a href="https://localghost.dev/2019/09/everything-i-googled-in-a-week-as-a-professional-software-engineer/">how to do your job as a software engineer</a>), you’ve searched vast amounts of unstructured data multiple times today. What’s even more amazing is that, even though you searched millions (or <a href="https://www.worldwidewebsize.com/">billions</a>) of records, you got a response in milliseconds. In this post, we are going to explore the basic components of a full-text search engine, and use them to build one that can search across millions of documents and rank them according to their relevance in milliseconds, in less than 150 lines of Python code!</p>
<div id="player">
<p class="listen">Listen to this article instead</p>

<audio controls class="audio_controls " preload="metadata">
<source src="https://bart.degoe.de/audio/2021-03-24-python-full-text-search-engine.mp3" type="audio/mp3">
Your browser does not support the audio element
</audio>
</div>
<h1 id="data">Data</h1>
<p>All the code in this blog post can be found on <a href="https://github.com/bartdegoede/python-searchengine/">Github</a>. I’ll provide links with the code snippets here, so you can try running this yourself. You can run the full example by installing <a href="https://github.com/bartdegoede/python-searchengine/blob/master/requirements.txt">the requirements</a> (<code>pip install -r requirements.txt</code>) and <a href="https://github.com/bartdegoede/python-searchengine/blob/master/run.py">running <code>python run.py</code></a>. This will download all the data and execute the example query with and without rankings.</p>
<p>Before we jump into building a search engine, we first need some full-text, unstructured data to search. We are going to be searching abstracts of articles from the English Wikipedia, which is currently a gzipped XML file of about 785mb and contains about 6.27 million abstracts<sup id="fnref:1"></sup>. I’ve written <a href="https://github.com/bartdegoede/python-searchengine/blob/master/download.py">a simple function to download</a> the gzipped XML, but you can also just manually download the file.</p>
<h2 id="data-preparation">Data preparation</h2>
<p>The file is one large XML file that contains all abstracts. One abstract in this file is contained by a <code>&lt;doc&gt;</code> element, and looks roughly like this (I’ve omitted elements we’re not interested in):</p>
<div class="highlight"><pre><code class="language-xml" data-lang="xml"><span>&lt;doc&gt;</span>
    <span>&lt;title&gt;</span>Wikipedia: London Beer Flood<span>&lt;/title&gt;</span>
    <span>&lt;url&gt;</span>https://en.wikipedia.org/wiki/London_Beer_Flood<span>&lt;/url&gt;</span>
    <span>&lt;abstract&gt;</span>The London Beer Flood was an accident at Meux <span>&amp;</span> Co's Horse Shoe Brewery, London, on 17 October 1814. It took place when one of the  wooden vats of fermenting porter burst.<span>&lt;/abstract&gt;</span>
    ...
<span>&lt;/doc&gt;</span>
</code></pre></div>
<p>The bits we’re interested in are the <code>title</code>, the <code>url</code> and the <code>abstract</code> text itself. We’ll represent documents with a <a href="https://realpython.com/python-data-classes/">Python dataclass</a> for convenient data access. We’ll add a property that concatenates the title and the contents of the abstract. You can find the code <a href="https://github.com/bartdegoede/python-searchengine/blob/master/search/documents.py">here</a>.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>from</span> dataclasses <span>import</span> dataclass

<span>@dataclass</span>
<span>class</span> <span>Abstract</span>:
    <span>"""Wikipedia abstract"""</span>
    ID: int
    title: str
    abstract: str
    url: str

    <span>@property</span>
    <span>def</span> <span>fulltext</span>(self):
        <span>return</span> <span>' '</span><span>.</span>join([self<span>.</span>title, self<span>.</span>abstract])
</code></pre></div>
<p>Then, we’ll want to extract the abstracts data from the XML and parse it so we can create instances of our <code>Abstract</code> object. We are going to stream through the gzipped XML without loading the entire file into memory first<sup id="fnref:2"></sup>. We’ll assign each document an ID in order of loading (ie the first document will have ID=1, the second one will have ID=2, etcetera). You can find the code <a href="https://github.com/bartdegoede/python-searchengine/blob/master/load.py">here</a>.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>import</span> gzip
<span>from</span> lxml <span>import</span> etree

<span>from</span> search.documents <span>import</span> Abstract

<span>def</span> <span>load_documents</span>():
    <span># open a filehandle to the gzipped Wikipedia dump</span>
    <span>with</span> gzip<span>.</span>open(<span>'data/enwiki.latest-abstract.xml.gz'</span>, <span>'rb'</span>) <span>as</span> f:
        doc_id <span>=</span> <span>1</span>
        <span># iterparse will yield the entire `doc` element once it finds the</span>
        <span># closing `&lt;/doc&gt;` tag</span>
        <span>for</span> _, element <span>in</span> etree<span>.</span>iterparse(f, events<span>=</span>(<span>'end'</span>,), tag<span>=</span><span>'doc'</span>):
            title <span>=</span> element<span>.</span>findtext(<span>'./title'</span>)
            url <span>=</span> element<span>.</span>findtext(<span>'./url'</span>)
            abstract <span>=</span> element<span>.</span>findtext(<span>'./abstract'</span>)

            <span>yield</span> Abstract(ID<span>=</span>doc_id, title<span>=</span>title, url<span>=</span>url, abstract<span>=</span>abstract)

            doc_id <span>+=</span> <span>1</span>
            <span># the `element.clear()` call will explicitly free up the memory</span>
            <span># used to store the element</span>
            element<span>.</span>clear()
</code></pre></div>
<h1 id="indexing">Indexing</h1>
<p>We are going to store this in a data structure known as an <a href="https://en.wikipedia.org/wiki/Inverted_index">“inverted index” or a “postings list”</a>. Think of it as the index in the back of a book that has an alphabetized list of relevant words and concepts, and on what page number a reader can find them.</p>
<figure>
<img src="https://bart.degoe.de/img/2021-03-24-building-a-full-text-search-engine-150-lines-of-code/book-index-1080x675.png"> <figcaption>
<h4>Back of the book index</h4>
</figcaption>
</figure>
<p>Practically, what this means is that we’re going to create a dictionary where we map all the words in our corpus to the IDs of the documents they occur in. That will look something like this:</p>
<div class="highlight"><pre><code class="language-json" data-lang="json">{
    <span>...</span>
    <span>"london"</span>: [<span>5245250</span>, <span>2623812</span>, <span>133455</span>, <span>3672401</span>, <span>...</span>],
    <span>"beer"</span>: [<span>1921376</span>, <span>4411744</span>, <span>684389</span>, <span>2019685</span>, <span>...</span>],
    <span>"flood"</span>: [<span>3772355</span>, <span>2895814</span>, <span>3461065</span>, <span>5132238</span>, <span>...</span>],
    <span>...</span>
}
</code></pre></div>
<p>Note that in the example above the words in the dictionary are lowercased; before building the index we are going to break down or <code>analyze</code> the raw text into a list of words or <code>tokens</code>. The idea is that we first break up or <code>tokenize</code> the text into words, and then apply zero or more <code>filters</code> (such as lowercasing or stemming) on each token to improve the odds of matching queries to text.</p>
<figure>
<img src="https://bart.degoe.de/img/2021-03-24-building-a-full-text-search-engine-150-lines-of-code/tokenization.png"> <figcaption>
<h4>Tokenization</h4>
</figcaption>
</figure>
<h2 id="analysis">Analysis</h2>
<p>We are going to apply very simple tokenization, by just splitting the text on whitespace. Then, we are going to apply a couple of filters on each of the tokens: we are going to lowercase each token, remove any punctuation, remove the 25 most common words in the English language (and the word “wikipedia” because it occurs in every title in every abstract) and apply <a href="https://en.wikipedia.org/wiki/Stemming">stemming</a> to every word (ensuring that different forms of a word map to the same stem, like <em>brewery</em> and <em>breweries</em><sup id="fnref:3"></sup>).</p>
<p>The tokenization and lowercase filter are very simple:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>import</span> Stemmer

STEMMER <span>=</span> Stemmer<span>.</span>Stemmer(<span>'english'</span>)

<span>def</span> <span>tokenize</span>(text):
    <span>return</span> text<span>.</span>split()

<span>def</span> <span>lowercase_filter</span>(tokens):
    <span>return</span> [token<span>.</span>lower() <span>for</span> token <span>in</span> tokens]

<span>def</span> <span>stem_filter</span>(tokens):
    <span>return</span> STEMMER<span>.</span>stemWords(tokens)
</code></pre></div>
<p>Punctuation is nothing more than a regular expression on the set of punctuation:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>import</span> re
<span>import</span> string

PUNCTUATION <span>=</span> re<span>.</span>compile(<span>'[</span><span>%s</span><span>]'</span> <span>%</span> re<span>.</span>escape(string<span>.</span>punctuation))

<span>def</span> <span>punctuation_filter</span>(tokens):
    <span>return</span> [PUNCTUATION<span>.</span>sub(<span>''</span>, token) <span>for</span> token <span>in</span> tokens]
</code></pre></div>
<p>Stopwords are words that are very common and we would expect to occur in (almost) every document in the corpus. As such, they won’t contribute much when we search for them (i.e. (almost) every document will match when we search for those terms) and will just take up space, so we will filter them out at index time. The Wikipedia abstract corpus includes the word “Wikipedia” in every title, so we’ll add that word to the stopword list as well. We drop the 25 most common words in English.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span># top 25 most common words in English and "wikipedia":</span>
<span># https://en.wikipedia.org/wiki/Most_common_words_in_English</span>
STOPWORDS <span>=</span> set([<span>'the'</span>, <span>'be'</span>, <span>'to'</span>, <span>'of'</span>, <span>'and'</span>, <span>'a'</span>, <span>'in'</span>, <span>'that'</span>, <span>'have'</span>,
                 <span>'I'</span>, <span>'it'</span>, <span>'for'</span>, <span>'not'</span>, <span>'on'</span>, <span>'with'</span>, <span>'he'</span>, <span>'as'</span>, <span>'you'</span>,
                 <span>'do'</span>, <span>'at'</span>, <span>'this'</span>, <span>'but'</span>, <span>'his'</span>, <span>'by'</span>, <span>'from'</span>, <span>'wikipedia'</span>])

<span>def</span> <span>stopword_filter</span>(tokens):
    <span>return</span> [token <span>for</span> token <span>in</span> tokens <span>if</span> token <span>not</span> <span>in</span> STOPWORDS]
</code></pre></div>
<p>Bringing all these filters together, we’ll <a href="https://github.com/bartdegoede/python-searchengine/blob/master/search/analysis.py#L28-L35">construct an <code>analyze</code> function</a> that will operate on the <code>text</code> in each abstract; it will tokenize the text into individual words (or rather, <em>tokens</em>), and then apply each filter in succession to the list of tokens. The order is important, because we use a non-stemmed list of stopwords, so we should apply the <code>stopword_filter</code> before the <code>stem_filter</code>.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>def</span> <span>analyze</span>(text):
    tokens <span>=</span> tokenize(text)
    tokens <span>=</span> lowercase_filter(tokens)
    tokens <span>=</span> punctuation_filter(tokens)
    tokens <span>=</span> stopword_filter(tokens)
    tokens <span>=</span> stem_filter(tokens)

    <span>return</span> [token <span>for</span> token <span>in</span> tokens <span>if</span> token]
</code></pre></div>
<h2 id="indexing-the-corpus">Indexing the corpus</h2>
<p>We’ll create an <code>Index</code> class that will store the <code>index</code> and the <code>documents</code>. The <code>documents</code> dictionary stores the dataclasses by ID, and the <code>index</code> keys will be the tokens, with the values being the document IDs the token occurs in:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>class</span> <span>Index</span>:
    <span>def</span> __init__(self):
        self<span>.</span>index <span>=</span> {}
        self<span>.</span>documents <span>=</span> {}

    <span>def</span> <span>index_document</span>(self, document):
        <span>if</span> document<span>.</span>ID <span>not</span> <span>in</span> self<span>.</span>documents:
            self<span>.</span>documents[document<span>.</span>ID] <span>=</span> document

        <span>for</span> token <span>in</span> analyze(document<span>.</span>fulltext):
            <span>if</span> token <span>not</span> <span>in</span> self<span>.</span>index:
                self<span>.</span>index[token] <span>=</span> set()
            self<span>.</span>index[token]<span>.</span>add(document<span>.</span>ID)
</code></pre></div>
<h1 id="searching">Searching</h1>
<p>Now we have all tokens indexed, searching for a query becomes a matter of analyzing the query text with the same analyzer as we applied to the documents; this way we’ll end up with tokens that should match the tokens we have in the index. For each token, we’ll do a lookup in the dictionary, finding the document IDs that the token occurs in. We do this for every token, and then find the IDs of documents in all these sets (i.e. for a document to match the query, it needs to contain all the tokens in the query). We will then take the resulting list of document IDs, and fetch the actual data from our <code>documents</code> store<sup id="fnref:4"></sup>.</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>def</span> <span>_results</span>(self, analyzed_query):
    <span>return</span> [self<span>.</span>index<span>.</span>get(token, set()) <span>for</span> token <span>in</span> analyzed_query]

<span>def</span> <span>search</span>(self, query):
    <span>"""
</span><span>    Boolean search; this will return documents that contain all words from the
</span><span>    query, but not rank them (sets are fast, but unordered).
</span><span>    """</span>
    analyzed_query <span>=</span> analyze(query)
    results <span>=</span> self<span>.</span>_results(analyzed_query)
    documents <span>=</span> [self<span>.</span>documents[doc_id] <span>for</span> doc_id <span>in</span> set<span>.</span>intersection(<span>*</span>results)]

    <span>return</span> documents


In [<span>1</span>]: index<span>.</span>search(<span>'London Beer Flood'</span>)
search took <span>0.16307830810546875</span> milliseconds
Out[<span>1</span>]:
[Abstract(ID<span>=</span><span>1501027</span>, title<span>=</span><span>'Wikipedia: Horse Shoe Brewery'</span>, abstract<span>=</span><span>'The Horse Shoe Brewery was an English brewery in the City of Westminster that was established in 1764 and became a major producer of porter, from 1809 as Henry Meux &amp; Co. It was the site of the London Beer Flood in 1814, which killed eight people after a porter vat burst.'</span>, url<span>=</span><span>'https://en.wikipedia.org/wiki/Horse_Shoe_Brewery'</span>),
 Abstract(ID<span>=</span><span>1828015</span>, title<span>=</span><span>'Wikipedia: London Beer Flood'</span>, abstract<span>=</span><span>"The London Beer Flood was an accident at Meux &amp; Co's Horse Shoe Brewery, London, on 17 October 1814. It took place when one of the  wooden vats of fermenting porter burst."</span>, url<span>=</span><span>'https://en.wikipedia.org/wiki/London_Beer_Flood'</span>)]
</code></pre></div>
<p>Now, this will make our queries very precise, especially for long query strings (the more tokens our query contains, the less likely it’ll be that there will be a document that has all of these tokens). We could optimize our search function for <a href="https://en.wikipedia.org/wiki/Precision_and_recall">recall rather than precision</a> by allowing users to specify that only one occurrence of a token is enough to match our query:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>def</span> <span>search</span>(self, query, search_type<span>=</span><span>'AND'</span>):
    <span>"""
</span><span>    Still boolean search; this will return documents that contain either all words
</span><span>    from the query or just one of them, depending on the search_type specified.
</span><span>
</span><span>    We are still not ranking the results (sets are fast, but unordered).
</span><span>    """</span>
    <span>if</span> search_type <span>not</span> <span>in</span> (<span>'AND'</span>, <span>'OR'</span>):
        <span>return</span> []

    analyzed_query <span>=</span> analyze(query)
    results <span>=</span> self<span>.</span>_results(analyzed_query)
    <span>if</span> search_type <span>==</span> <span>'AND'</span>:
        <span># all tokens must be in the document</span>
        documents <span>=</span> [self<span>.</span>documents[doc_id] <span>for</span> doc_id <span>in</span> set<span>.</span>intersection(<span>*</span>results)]
    <span>if</span> search_type <span>==</span> <span>'OR'</span>:
        <span># only one token has to be in the document</span>
        documents <span>=</span> [self<span>.</span>documents[doc_id] <span>for</span> doc_id <span>in</span> set<span>.</span>union(<span>*</span>results)]

    <span>return</span> documents


In [<span>2</span>]: index<span>.</span>search(<span>'London Beer Flood'</span>, search_type<span>=</span><span>'OR'</span>)
search took <span>0.02816295623779297</span> seconds
Out[<span>2</span>]:
[Abstract(ID<span>=</span><span>5505026</span>, title<span>=</span><span>'Wikipedia: Addie Pryor'</span>, abstract<span>=</span><span>'| birth_place    = London, England'</span>, url<span>=</span><span>'https://en.wikipedia.org/wiki/Addie_Pryor'</span>),
 Abstract(ID<span>=</span><span>1572868</span>, title<span>=</span><span>'Wikipedia: Tim Steward'</span>, abstract<span>=</span><span>'|birth_place         = London, United Kingdom'</span>, url<span>=</span><span>'https://en.wikipedia.org/wiki/Tim_Steward'</span>),
 Abstract(ID<span>=</span><span>5111814</span>, title<span>=</span><span>'Wikipedia: 1877 Birthday Honours'</span>, abstract<span>=</span><span>'The 1877 Birthday Honours were appointments by Queen Victoria to various orders and honours to reward and highlight good works by citizens of the British Empire. The appointments were made to celebrate the official birthday of the Queen, and were published in The London Gazette on 30 May and 2 June 1877.'</span>, url<span>=</span><span>'https://en.wikipedia.org/wiki/1877_Birthday_Honours'</span>),
 <span>...</span>
In [<span>3</span>]: len(index<span>.</span>search(<span>'London Beer Flood'</span>, search_type<span>=</span><span>'OR'</span>))
search took <span>0.029065370559692383</span> seconds
Out[<span>3</span>]: <span>49627</span>
</code></pre></div>
<h1 id="relevancy">Relevancy</h1>
<p>We have implemented a pretty quick search engine with just some basic Python, but there’s one aspect that’s obviously missing from our little engine, and that’s the <a href="https://livebook.manning.com/book/relevant-search/chapter-1/13">idea of <strong>relevance</strong></a>. Right now we just return an unordered list of documents, and we leave it up to the user to figure out which of those (s)he is actually interested in. Especially for large result sets, that is painful or just impossible (in our <code>OR</code> example, there are almost 50,000 results).</p>
<p>This is where the idea of relevancy comes in; what if we could assign each document a score that would indicate how well it matches the query, and just order by that score? A naive and simple way of assigning a score to a document for a given query is to just count how often that document mentions that particular word. After all, the more that document mentions that term, the more likely it is that it is about our query!</p>
<h2 id="term-frequency">Term frequency</h2>
<p>Let’s expand our <code>Abstract</code> dataclass to compute and store its term frequencies when we index it. That way, we’ll have easy access to those numbers when we want to rank our unordered list of documents:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span># in documents.py</span>
<span>from</span> collections <span>import</span> Counter
<span>from</span> .analysis <span>import</span> analyze

<span>@dataclass</span>
<span>class</span> <span>Abstract</span>:
    <span># snip</span>
    <span>def</span> <span>analyze</span>(self):
        <span># Counter will create a dictionary counting the unique values in an array:</span>
        <span># {'london': 12, 'beer': 3, ...}</span>
        self<span>.</span>term_frequencies <span>=</span> Counter(analyze(self<span>.</span>fulltext))

    <span>def</span> <span>term_frequency</span>(self, term):
        <span>return</span> self<span>.</span>term_frequencies<span>.</span>get(term, <span>0</span>)
</code></pre></div>
<p>We need to make sure to generate these frequency counts when we index our data:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span># in index.py we add `document.analyze()`</span>

<span>def</span> <span>index_document</span>(self, document):
    <span>if</span> document<span>.</span>ID <span>not</span> <span>in</span> self<span>.</span>documents:
        self<span>.</span>documents[document<span>.</span>ID] <span>=</span> document
        document<span>.</span>analyze()
</code></pre></div>
<p>We’ll modify our search function so we can apply a ranking to the documents in our result set. We’ll fetch the documents using the same Boolean query from the index and document store, and then, for every document in that result set, we’ll simply sum up how often each term occurs in that document:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span>def</span> <span>search</span>(self, query, search_type<span>=</span><span>'AND'</span>, rank<span>=</span>True):
    <span># snip</span>
    <span>if</span> rank:
        <span>return</span> self<span>.</span>rank(analyzed_query, documents)
    <span>return</span> documents


<span>def</span> <span>rank</span>(self, analyzed_query, documents):
    results <span>=</span> []
    <span>if</span> <span>not</span> documents:
        <span>return</span> results
    <span>for</span> document <span>in</span> documents:
        score <span>=</span> sum([document<span>.</span>term_frequency(token) <span>for</span> token <span>in</span> analyzed_query])
        results<span>.</span>append((document, score))
    <span>return</span> sorted(results, key<span>=</span><span>lambda</span> doc: doc[<span>1</span>], reverse<span>=</span>True)
</code></pre></div>
<h2 id="inverse-document-frequency">Inverse Document Frequency</h2>
<p>That’s already a lot better, but there are some obvious short-comings. We’re considering all query terms to be of equivalent value when assessing the relevancy for the query. However, it’s likely that certain terms have very little to no discriminating power when determining relevancy; for example, a collection with lots of documents about beer would be expected to have the term “beer” appear often in almost every document (in fact, we’re already trying to address that by dropping the 25 most common English words from the index). Searching for the word “beer” in such a case would essentially do another random sort.</p>
<p>In order to address that, we’ll add another component to our scoring algorithm that will reduce the contribution of terms that occur very often in the index to the final score. We could use the <em>collection frequency</em> of a term (i.e. how often does this term occur across <em>all</em> documents), but <a href="https://nlp.stanford.edu/IR-book/html/htmledition/inverse-document-frequency-1.html">in practice</a> the <em>document frequency</em> is used instead (i.e. how many <em>documents</em> in the index contain this term). We’re trying to rank documents after all, so it makes sense to have a document level statistic.</p>
<p>We’ll compute the <em>inverse document frequency</em> for a term by dividing the number of documents (<em>N</em>) in the index by the amount of documents that contain the term, and take a logarithm of that.</p>
<figure>
<img src="https://bart.degoe.de/img/2021-03-24-building-a-full-text-search-engine-150-lines-of-code/idf.jpg"> <figcaption>
<h4>IDF; taken from https://moz.com/blog/inverse-document-frequency-and-the-importance-of-uniqueness</h4>
</figcaption>
</figure>
<p>We’ll then simply multiply the term frequency with the inverse document frequency during our ranking, so matches on terms that are rare in the corpus will contribute more to the relevancy score<sup id="fnref:5"></sup>. We can easily compute the inverse document frequency from the data available in our index:</p>
<div class="highlight"><pre><code class="language-python" data-lang="python"><span># index.py</span>
<span>import</span> math

<span>def</span> <span>document_frequency</span>(self, token):
    <span>return</span> len(self<span>.</span>index<span>.</span>get(token, set()))

<span>def</span> <span>inverse_document_frequency</span>(self, token):
    <span># Manning, Raghavan and Schütze use log10, so we do too, even though it</span>
    <span># doesn't really matter which log we use anyway</span>
    <span># https://nlp.stanford.edu/IR-book/html/htmledition/inverse-document-frequency-1.html</span>
    <span>return</span> math<span>.</span>log10(len(self<span>.</span>documents) <span>/</span> self<span>.</span>document_frequency(token))

<span>def</span> <span>rank</span>(self, analyzed_query, documents):
    results <span>=</span> []
    <span>if</span> <span>not</span> documents:
        <span>return</span> results
    <span>for</span> document <span>in</span> documents:
        score <span>=</span> <span>0.0</span>
        <span>for</span> token <span>in</span> analyzed_query:
            tf <span>=</span> document<span>.</span>term_frequency(token)
            idf <span>=</span> self<span>.</span>inverse_document_frequency(token)
            score <span>+=</span> tf <span>*</span> idf
        results<span>.</span>append((document, score))
    <span>return</span> sorted(results, key<span>=</span><span>lambda</span> doc: doc[<span>1</span>], reverse<span>=</span>True)
</code></pre></div>
<h1 id="future-work">Future Work™</h1>
<p>And that’s a basic search engine in just a few lines of Python code! You can find all the code on <a href="https://github.com/bartdegoede/python-searchengine">Github</a>, and I’ve provided a utility function that will download the Wikipedia abstracts and build an index. Install the requirements, run it in your Python console of choice and have fun messing with the data structures and searching.</p>
<p>Now, obviously this is a project to illustrate the concepts of search and how it can be so fast (even with ranking, I can search and rank 6.27m documents on my laptop with a “slow” language like Python) and not production grade software. It runs entirely in memory on my laptop, whereas libraries like Lucene utilize hyper-efficient data structures and even optimize disk seeks, and software like Elasticsearch and Solr scale Lucene to hundreds if not thousands of machines.</p>
<p>That doesn’t mean that we can’t think about fun expansions on this basic functionality though; for example, we assume that every field in the document has the same contribution to relevancy, whereas a query term match in the title should probably be weighted more strongly than a match in the description. Another fun project could be to expand the query parsing; there’s no reason why either all or just one term need to match. Why not exclude certain terms, or do <code>AND</code> and <code>OR</code> between individual terms? Can we persist the index to disk and make it scale beyond the confines of my laptop RAM?</p>
</article>


<hr>

<footer>
  <p>
    <a href="/david/" title="Aller à l’accueil"><svg class="icon icon-home">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-home"></use>
      </svg> Accueil</a> •
    <a href="/david/log/" title="Accès au flux RSS"><svg class="icon icon-rss2">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-rss2"></use>
      </svg> Suivre</a> •
    <a href="http://larlet.com" title="Go to my English profile" data-instant><svg class="icon icon-user-tie">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-user-tie"></use>
      </svg> Pro</a> •
    <a href="mailto:david%40larlet.fr" title="Envoyer un courriel"><svg class="icon icon-mail">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-mail"></use>
      </svg> Email</a> •
    <abbr class="nowrap" title="Hébergeur : Alwaysdata, 62 rue Tiquetonne 75002 Paris, +33184162340"><svg class="icon icon-hammer2">
        <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-hammer2"></use>
      </svg> Légal</abbr>
  </p>
  <template id="theme-selector">
      <form>
          <fieldset>
              <legend><svg class="icon icon-brightness-contrast">
                <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-brightness-contrast"></use>
              </svg> Thème</legend>
              <label>
                  <input type="radio" value="auto" name="chosen-color-scheme" checked> Auto
              </label>
              <label>
                  <input type="radio" value="dark" name="chosen-color-scheme"> Foncé
              </label>
              <label>
                  <input type="radio" value="light" name="chosen-color-scheme"> Clair
              </label>
          </fieldset>
      </form>
  </template>
</footer>
<script src="/static/david/js/instantpage-5.1.0.min.js" type="module"></script>
<script>
    /**
     * Swap the theme selector `<template>` for the form it contains, persist
     * the visitor's choice on change and restore a previously stored choice.
     *
     * @param {string} templateName - CSS selector of the `<template>` element
     *   whose first child is the theme form.
     */
    function loadThemeForm(templateName) {
        const themeSelectorTemplate = document.querySelector(templateName)
        const form = themeSelectorTemplate.content.firstElementChild
        themeSelectorTemplate.replaceWith(form)

        form.addEventListener('change', (e) => {
            const chosenColorScheme = e.target.value
            localStorage.setItem('theme', chosenColorScheme)
            toggleTheme(chosenColorScheme)
        })

        const selectedTheme = localStorage.getItem('theme')
        if (!selectedTheme || selectedTheme === 'undefined') {
            return
        }
        // Compare values instead of interpolating the stored string into a
        // selector: a stale or corrupted entry (unknown theme, stray quote)
        // must leave the default radio checked rather than throw.
        for (const radio of form.querySelectorAll('input[type="radio"]')) {
            if (radio.value === selectedTheme) {
                radio.checked = true
                break
            }
        }
    }

    // Media condition identifying the dark color scheme rules.
    const prefersColorSchemeDark = '(prefers-color-scheme: dark)'
    // Once every stylesheet is loaded, copy the rules nested in
    // `@media (prefers-color-scheme: dark)` blocks to the top level of their
    // stylesheet, then show the theme form if at least one rule was copied.
    window.addEventListener('load', () => {
        let hasDarkRules = false
        for (const styleSheet of Array.from(document.styleSheets)) {
            let sheetRules
            try {
                sheetRules = styleSheet.cssRules
            } catch (err) {
                // Reading the rules of a cross-origin stylesheet is denied
                // by the browser: skip that sheet instead of aborting.
                if (err.name === 'SecurityError') {
                    continue
                }
                throw err
            }

            const mediaRules = []
            for (const cssRule of sheetRules) {
                if (cssRule.type !== CSSRule.MEDIA_RULE) {
                    continue
                }
                // WARNING: Safari does not have/supports `conditionText`,
                // so fall back to the serialized media list. `cssText`
                // cannot be used here because it starts with `@media `.
                const condition = cssRule.conditionText || cssRule.media.mediaText
                if (condition !== prefersColorSchemeDark) {
                    continue
                }
                mediaRules.push(...cssRule.cssRules)
            }

            // WARNING: do not try to insert a Rule to a styleSheet you are
            // currently iterating on, otherwise the browser will be stuck
            // in an infinite loop…
            for (const mediaRule of mediaRules) {
                styleSheet.insertRule(mediaRule.cssText)
                hasDarkRules = true
            }
        }
        if (hasDarkRules) {
            loadThemeForm('#theme-selector')
        }
    })
</script>
</body>
</html>