123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- <!doctype html><!-- This is a valid HTML5 document. -->
- <!-- Screen readers, SEO, extensions and so on. -->
- <html lang="fr">
- <!-- Has to be within the first 1024 bytes, hence before the `title` element
- See: https://www.w3.org/TR/2012/CR-html5-20121217/document-metadata.html#charset -->
- <meta charset="utf-8">
- <!-- Why no `X-UA-Compatible` meta: https://stackoverflow.com/a/6771584 -->
- <!-- The viewport meta is quite crowded and we are responsible for that.
- See: https://codepen.io/tigt/post/meta-viewport-for-2015 -->
- <meta name="viewport" content="width=device-width,initial-scale=1">
- <!-- Required to make a valid HTML5 document. -->
- <title>Host your own wikipedia backup (archive) — David Larlet</title>
- <meta name="description" content="Publication mise en cache pour en conserver une trace.">
- <!-- That good ol' feed, subscribe :). -->
- <link rel="alternate" type="application/atom+xml" title="Feed" href="/david/log/">
- <!-- Generated from https://realfavicongenerator.net/ such a mess. -->
- <link rel="apple-touch-icon" sizes="180x180" href="/static/david/icons2/apple-touch-icon.png">
- <link rel="icon" type="image/png" sizes="32x32" href="/static/david/icons2/favicon-32x32.png">
- <link rel="icon" type="image/png" sizes="16x16" href="/static/david/icons2/favicon-16x16.png">
- <link rel="manifest" href="/static/david/icons2/site.webmanifest">
- <link rel="mask-icon" href="/static/david/icons2/safari-pinned-tab.svg" color="#07486c">
- <link rel="shortcut icon" href="/static/david/icons2/favicon.ico">
- <meta name="msapplication-TileColor" content="#f7f7f7">
- <meta name="msapplication-config" content="/static/david/icons2/browserconfig.xml">
- <meta name="theme-color" content="#f7f7f7" media="(prefers-color-scheme: light)">
- <meta name="theme-color" content="#272727" media="(prefers-color-scheme: dark)">
- <!-- Documented, feel free to shoot an email. -->
- <link rel="stylesheet" href="/static/david/css/style_2021-01-20.css">
- <!-- See https://www.zachleat.com/web/comprehensive-webfonts/ for the trade-off. -->
- <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
- <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
- <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
- <link rel="preload" href="/static/david/css/fonts/triplicate_t3_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
- <link rel="preload" href="/static/david/css/fonts/triplicate_t3_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
- <link rel="preload" href="/static/david/css/fonts/triplicate_t3_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
- <script>
- function toggleTheme(themeName) {
- document.documentElement.classList.toggle(
- 'forced-dark',
- themeName === 'dark'
- )
- document.documentElement.classList.toggle(
- 'forced-light',
- themeName === 'light'
- )
- }
- const selectedTheme = localStorage.getItem('theme')
- if (selectedTheme !== 'undefined') {
- toggleTheme(selectedTheme)
- }
- </script>
-
- <meta name="robots" content="noindex, nofollow">
- <meta content="origin-when-cross-origin" name="referrer">
- <!-- Canonical URL for SEO purposes -->
- <link rel="canonical" href="https://dataswamp.org/~solene/2019-11-13-wikimedia-dump.html">
-
- <body class="remarkdown h1-underline h2-underline h3-underline em-underscore hr-center ul-star pre-tick" data-instant-intensity="viewport-all">
-
-
- <article>
- <header>
- <h1>Host your own wikipedia backup</h1>
- </header>
- <nav>
- <p class="center">
- <a href="/david/" title="Aller à l’accueil"><svg class="icon icon-home">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-home"></use>
- </svg> Accueil</a> •
- <a href="https://dataswamp.org/~solene/2019-11-13-wikimedia-dump.html" title="Lien vers le contenu original">Source originale</a>
- </p>
- </nav>
- <hr>
- <h2 id="wikipediaandopenzim">Wikipedia and openzim</h2>
-
- <p>If you ever wanted to host your own wikipedia replica, here is the simplest
- way.</p>
-
- <p>As wikipedia is REALLY huge, you don’t really want to host a php wikimedia
- software and load the huge database, instead, the project made the <em>openzim</em>
- format to compress the huge database that wikipedia became while allowing using
- it for fast searches.</p>
-
- <p>Sadly, on OpenBSD, we have no software reading zim files and most software
- requires the library openzim to work which requires extra work to get it as a
- package on OpenBSD.</p>
-
- <p>Hopefully, there is a python package implementing all you need as pure python
- to serve zim files over http and it’s easy to install.</p>
-
- <p>This tutorial should work on all others unix like systems but packages or
- binary names may change.</p>
-
- <h2 id="downloadingwikipedia">Downloading wikipedia</h2>
-
- <p>The project Kiwix is responsible for wikipedia files, they create regularly
- files from various projects (including stackexchange, gutenberg, wikibooks
- etc…) but for this tutorial we want wikipedia:
- <a href="https://wiki.kiwix.org/wiki/Content_in_all_languages">https://wiki.kiwix.org/wiki/Content_in_all_languages</a></p>
-
- <p>You will find a lot of files, the language is contained into the filename. Some
- filenames will also self explain if they contain everything or categories, and
- if they have pictures or not.</p>
-
- <p>The full French file is 31.4 GB worth.</p>
-
- <h2 id="runningtheserver">Running the server</h2>
-
- <p>For the next steps, I recommend setting up a new user dedicated to this.</p>
-
- <p>On OpenBSD, we will require python3 and pip:</p>
-
- <pre><code>$ doas pkg_add py3-pip--
- </code></pre>
-
- <p>Then we can use pip to fetch and install dependencies for the zimply software,
- the flag <code>--user</code> is rather important as it allows any user to download and
- install python libraries in its home folder instead of polluting the whole
- system as root.</p>
-
- <pre><code>$ pip3.7 install --user --upgrade zimply
- </code></pre>
-
- <p>I wrote a small script to start the server using the zim file as a parameter, I
- rarely write python so the script may not be high standard.</p>
-
- <p>File <strong>server.py</strong>:</p>
-
- <pre><code>from zimply import ZIMServer
- import sys
- import os.path
-
- if len(sys.argv) == 1:
- print("usage: " + sys.argv[0] + " file")
- exit(1)
-
- if os.path.exists(sys.argv[1]):
- ZIMServer(sys.argv[1])
- else:
- print("Can't find file " + sys.argv[1])
- </code></pre>
-
- <p>And then you can start the server using the command:</p>
-
- <pre><code>$ python3.7 server.py /path/to/wikipedia_fr_all_maxi_2019-08.zim
- </code></pre>
-
- <p>You will be able to access wikipedia on the url http://localhost:9454/</p>
-
- <p>Note that this is not a “wiki” as you can’t see history and edit/create pages.</p>
-
- <p>This kind of backup is used in place like Cuba or Africa areas where people
- don’t have unlimited internet access, the project lead by Kiwix allow more
- people to access knowledge.</p>
- </article>
-
-
- <hr>
-
- <footer>
- <p>
- <a href="/david/" title="Aller à l’accueil"><svg class="icon icon-home">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-home"></use>
- </svg> Accueil</a> •
- <a href="/david/log/" title="Accès au flux RSS"><svg class="icon icon-rss2">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-rss2"></use>
- </svg> Suivre</a> •
- <a href="http://larlet.com" title="Go to my English profile" data-instant><svg class="icon icon-user-tie">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-user-tie"></use>
- </svg> Pro</a> •
- <a href="mailto:david%40larlet.fr" title="Envoyer un courriel"><svg class="icon icon-mail">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-mail"></use>
- </svg> Email</a> •
- <abbr class="nowrap" title="Hébergeur : Alwaysdata, 62 rue Tiquetonne 75002 Paris, +33184162340"><svg class="icon icon-hammer2">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-hammer2"></use>
- </svg> Légal</abbr>
- </p>
- <template id="theme-selector">
- <form>
- <fieldset>
- <legend><svg class="icon icon-brightness-contrast">
- <use xlink:href="/static/david/icons2/symbol-defs.svg#icon-brightness-contrast"></use>
- </svg> Thème</legend>
- <label>
- <input type="radio" value="auto" name="chosen-color-scheme" checked> Auto
- </label>
- <label>
- <input type="radio" value="dark" name="chosen-color-scheme"> Foncé
- </label>
- <label>
- <input type="radio" value="light" name="chosen-color-scheme"> Clair
- </label>
- </fieldset>
- </form>
- </template>
- </footer>
- <script src="/static/david/js/instantpage-5.1.0.min.js" type="module"></script>
- <script>
- function loadThemeForm(templateName) {
- const themeSelectorTemplate = document.querySelector(templateName)
- const form = themeSelectorTemplate.content.firstElementChild
- themeSelectorTemplate.replaceWith(form)
-
- form.addEventListener('change', (e) => {
- const chosenColorScheme = e.target.value
- localStorage.setItem('theme', chosenColorScheme)
- toggleTheme(chosenColorScheme)
- })
-
- const selectedTheme = localStorage.getItem('theme')
- if (selectedTheme && selectedTheme !== 'undefined') {
- form.querySelector(`[value="${selectedTheme}"]`).checked = true
- }
- }
-
- const prefersColorSchemeDark = '(prefers-color-scheme: dark)'
- window.addEventListener('load', () => {
- let hasDarkRules = false
- for (const styleSheet of Array.from(document.styleSheets)) {
- let mediaRules = []
- for (const cssRule of styleSheet.cssRules) {
- if (cssRule.type !== CSSRule.MEDIA_RULE) {
- continue
- }
- // WARNING: Safari does not have/supports `conditionText`.
- if (cssRule.conditionText) {
- if (cssRule.conditionText !== prefersColorSchemeDark) {
- continue
- }
- } else {
- if (cssRule.cssText.startsWith(prefersColorSchemeDark)) {
- continue
- }
- }
- mediaRules = mediaRules.concat(Array.from(cssRule.cssRules))
- }
-
- // WARNING: do not try to insert a Rule to a styleSheet you are
- // currently iterating on, otherwise the browser will be stuck
- // in a infinite loop…
- for (const mediaRule of mediaRules) {
- styleSheet.insertRule(mediaRule.cssText)
- hasDarkRules = true
- }
- }
- if (hasDarkRules) {
- loadThemeForm('#theme-selector')
- }
- })
- </script>
- </body>
- </html>
|