12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752 |
- <!doctype html><!-- This is a valid HTML5 document. -->
- <!-- Screen readers, SEO, extensions and so on. -->
- <html lang=fr>
- <!-- Has to be within the first 1024 bytes, hence before the <title>
- See: https://www.w3.org/TR/2012/CR-html5-20121217/document-metadata.html#charset -->
- <meta charset=utf-8>
- <!-- Why no `X-UA-Compatible` meta: https://stackoverflow.com/a/6771584 -->
- <!-- The viewport meta is quite crowded and we are responsible for that.
- See: https://codepen.io/tigt/post/meta-viewport-for-2015 -->
- <meta name=viewport content="width=device-width,minimum-scale=1,initial-scale=1,shrink-to-fit=no">
- <!-- Required to make a valid HTML5 document. -->
- <title>Archiving web sites (archive) — David Larlet</title>
- <!-- Generated from https://realfavicongenerator.net/ such a mess. -->
- <link rel="apple-touch-icon" sizes="180x180" href="/static/david/icons/apple-touch-icon.png">
- <link rel="icon" type="image/png" sizes="32x32" href="/static/david/icons/favicon-32x32.png">
- <link rel="icon" type="image/png" sizes="16x16" href="/static/david/icons/favicon-16x16.png">
- <link rel="manifest" href="/manifest.json">
- <link rel="mask-icon" href="/static/david/icons/safari-pinned-tab.svg" color="#5bbad5">
- <link rel="shortcut icon" href="/static/david/icons/favicon.ico">
- <meta name="apple-mobile-web-app-title" content="David Larlet">
- <meta name="application-name" content="David Larlet">
- <meta name="msapplication-TileColor" content="#da532c">
- <meta name="msapplication-config" content="/static/david/icons/browserconfig.xml">
- <meta name="theme-color" content="#f0f0ea">
- <!-- That good ol' feed, subscribe :p. -->
- <link rel=alternate type="application/atom+xml" title=Feed href="/david/log/">
-
- <meta name="robots" content="noindex, nofollow">
- <meta content="origin-when-cross-origin" name="referrer">
- <!-- Canonical URL for SEO purposes -->
- <link rel="canonical" href="https://lwn.net/Articles/766374/">
-
- <style>
- /* http://meyerweb.com/eric/tools/css/reset/ */
- html, body, div, span,
- h1, h2, h3, h4, h5, h6, p, blockquote, pre,
- a, abbr, address, big, cite, code,
- del, dfn, em, img, ins,
- small, strike, strong, tt, var,
- dl, dt, dd, ol, ul, li,
- fieldset, form, label, legend,
- table, caption, tbody, tfoot, thead, tr, th, td,
- article, aside, canvas, details, embed,
- figure, figcaption, footer, header, hgroup,
- menu, nav, output, ruby, section, summary,
- time, mark, audio, video {
- margin: 0;
- padding: 0;
- border: 0;
- font-size: 100%;
- font: inherit;
- vertical-align: baseline;
- }
- /* HTML5 display-role reset for older browsers */
- article, aside, details, figcaption, figure,
- footer, header, hgroup, menu, nav, section { display: block; }
- body { line-height: 1; }
- blockquote, q { quotes: none; }
- blockquote:before, blockquote:after,
- q:before, q:after {
- content: '';
- content: none;
- }
- table {
- border-collapse: collapse;
- border-spacing: 0;
- }
-
- /* http://practicaltypography.com/equity.html */
- /* https://calendar.perfplanet.com/2016/no-font-face-bulletproof-syntax/ */
- /* https://www.filamentgroup.com/lab/js-web-fonts.html */
- @font-face {
- font-family: 'EquityTextB';
- src: url('/static/david/css/fonts/Equity-Text-B-Regular-webfont.woff2') format('woff2'),
- url('/static/david/css/fonts/Equity-Text-B-Regular-webfont.woff') format('woff');
- font-weight: 300;
- font-style: normal;
- font-display: swap;
- }
- @font-face {
- font-family: 'EquityTextB';
- src: url('/static/david/css/fonts/Equity-Text-B-Italic-webfont.woff2') format('woff2'),
- url('/static/david/css/fonts/Equity-Text-B-Italic-webfont.woff') format('woff');
- font-weight: 300;
- font-style: italic;
- font-display: swap;
- }
- @font-face {
- font-family: 'EquityTextB';
- src: url('/static/david/css/fonts/Equity-Text-B-Bold-webfont.woff2') format('woff2'),
- url('/static/david/css/fonts/Equity-Text-B-Bold-webfont.woff') format('woff');
- font-weight: 700;
- font-style: normal;
- font-display: swap;
- }
-
- @font-face {
- font-family: 'ConcourseT3';
- src: url('/static/david/css/fonts/concourse_t3_regular-webfont-20190806.woff2') format('woff2'),
- url('/static/david/css/fonts/concourse_t3_regular-webfont-20190806.woff') format('woff');
- font-weight: 300;
- font-style: normal;
- font-display: swap;
- }
-
-
- /* http://practice.typekit.com/lesson/caring-about-opentype-features/ */
- body {
- /* http://www.cssfontstack.com/ Palatino 99% Win 86% Mac */
- font-family: "EquityTextB", Palatino, serif;
- background-color: #f0f0ea;
- color: #07486c;
- font-kerning: normal;
- -moz-osx-font-smoothing: grayscale;
- -webkit-font-smoothing: subpixel-antialiased;
- text-rendering: optimizeLegibility;
- font-variant-ligatures: common-ligatures contextual;
- font-feature-settings: "kern", "liga", "clig", "calt";
- }
- pre, code, kbd, samp, var, tt {
- font-family: 'TriplicateT4c', monospace;
- }
- em {
- font-style: italic;
- color: #323a45;
- }
- strong {
- font-weight: bold;
- color: black;
- }
- nav {
- background-color: #323a45;
- color: #f0f0ea;
- display: flex;
- justify-content: space-around;
- padding: 1rem .5rem;
- }
- nav:last-child {
- border-bottom: 1vh solid #2d7474;
- }
- nav a {
- color: #f0f0ea;
- }
- nav abbr {
- border-bottom: 1px dotted white;
- }
-
- h1 {
- border-top: 1vh solid #2d7474;
- border-bottom: .2vh dotted #2d7474;
- background-color: #e3e1e1;
- color: #323a45;
- text-align: center;
- padding: 5rem 0 4rem 0;
- width: 100%;
- font-family: 'ConcourseT3';
- display: flex;
- flex-direction: column;
- }
- h1.single {
- padding-bottom: 10rem;
- }
- h1 span {
- position: absolute;
- top: 1vh;
- left: 20%;
- line-height: 0;
- }
- h1 span a {
- line-height: 1.7;
- padding: 1rem 1.2rem .6rem 1.2rem;
- border-radius: 0 0 6% 6%;
- background: #2d7474;
- font-size: 1.3rem;
- color: white;
- text-decoration: none;
- }
- h2 {
- margin: 4rem 0 1rem;
- border-top: .2vh solid #2d7474;
- padding-top: 1vh;
- }
- h3 {
- text-align: center;
- margin: 3rem 0 .75em;
- }
- hr {
- height: .4rem;
- width: .4rem;
- border-radius: .4rem;
- background: #07486c;
- margin: 2.5rem auto;
- }
- /* <time> is rendered as its own block-level line.
- The previous value `bloc` was not a valid `display` keyword, so the
- declaration was discarded by the CSS parser. */
- time {
- display: block;
- margin-left: 0 !important;
- }
- ul, ol {
- margin: 2rem;
- }
- ul {
- list-style-type: square;
- }
- a {
- text-decoration-skip-ink: auto;
- text-decoration-thickness: 0.05em;
- text-underline-offset: 0.09em;
- }
- article {
- max-width: 50rem;
- display: flex;
- flex-direction: column;
- margin: 2rem auto;
- }
- article.single {
- border-top: .2vh dotted #2d7474;
- margin: -6rem auto 1rem auto;
- background: #f0f0ea;
- padding: 2rem;
- }
- article p:last-child {
- margin-bottom: 1rem;
- }
- p {
- padding: 0 .5rem;
- margin-left: 3rem;
- }
- p + p,
- figure + p {
- margin-top: 2rem;
- }
-
- blockquote {
- background-color: #e3e1e1;
- border-left: .5vw solid #2d7474;
- display: flex;
- flex-direction: column;
- align-items: center;
- padding: 1rem;
- margin: 1.5rem;
- }
- blockquote cite {
- font-style: italic;
- }
- blockquote p {
- margin-left: 0;
- }
-
- figure {
- border-top: .2vh solid #2d7474;
- background-color: #e3e1e1;
- text-align: center;
- padding: 1.5rem 0;
- margin: 1rem 0 0;
- font-size: 1.5rem;
- width: 100%;
- }
- figure img {
- max-width: 250px;
- max-height: 250px;
- border: .5vw solid #323a45;
- padding: 1px;
- }
- figcaption {
- padding: 1rem;
- line-height: 1.4;
- }
- aside {
- display: flex;
- flex-direction: column;
- background-color: #e3e1e1;
- padding: 1rem 0;
- border-bottom: .2vh solid #07486c;
- }
- aside p {
- max-width: 50rem;
- margin: 0 auto;
- }
-
- /* https://fvsch.com/code/css-locks/ */
- p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
- font-size: 1rem;
- line-height: calc( 1.5em + 0.2 * 1rem );
- }
- h1 {
- font-size: 1.9rem;
- line-height: calc( 1.2em + 0.2 * 1rem );
- }
- h2 {
- font-size: 1.6rem;
- line-height: calc( 1.3em + 0.2 * 1rem );
- }
- h3 {
- font-size: 1.35rem;
- line-height: calc( 1.4em + 0.2 * 1rem );
- }
- @media (min-width: 20em) {
- /* The (100vw - 20rem) / (50 - 20) part
- resolves to 0-1rem, depending on the
- viewport width (between 20em and 50em). */
- p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
- font-size: calc( 1rem + .6 * (100vw - 20rem) / (50 - 20) );
- line-height: calc( 1.5em + 0.2 * (100vw - 50rem) / (20 - 50) );
- margin-left: 0;
- }
- h1 {
- font-size: calc( 1.9rem + 1.5 * (100vw - 20rem) / (50 - 20) );
- line-height: calc( 1.2em + 0.2 * (100vw - 50rem) / (20 - 50) );
- }
- h2 {
- font-size: calc( 1.5rem + 1.5 * (100vw - 20rem) / (50 - 20) );
- line-height: calc( 1.3em + 0.2 * (100vw - 50rem) / (20 - 50) );
- }
- h3 {
- font-size: calc( 1.35rem + 1.5 * (100vw - 20rem) / (50 - 20) );
- line-height: calc( 1.4em + 0.2 * (100vw - 50rem) / (20 - 50) );
- }
- }
- @media (min-width: 50em) {
- /* The right part of the addition *must* be a
- rem value. In this example we *could* change
- the whole declaration to font-size:2.5rem,
- but if our baseline value was not expressed
- in rem we would have to use calc. */
- p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
- font-size: calc( 1rem + .6 * 1rem );
- line-height: 1.5em;
- }
- p, li, pre, details {
- margin-left: 3rem;
- }
- h1 {
- font-size: calc( 1.9rem + 1.5 * 1rem );
- line-height: 1.2em;
- }
- h2 {
- font-size: calc( 1.5rem + 1.5 * 1rem );
- line-height: 1.3em;
- }
- h3 {
- font-size: calc( 1.35rem + 1.5 * 1rem );
- line-height: 1.4em;
- }
- figure img {
- max-width: 500px;
- max-height: 500px;
- }
- }
-
- figure.unsquared {
- margin-bottom: 1.5rem;
- }
- figure.unsquared img {
- height: inherit;
- }
-
-
-
- /* Print stylesheet: plain black-on-white text, link and abbreviation
- targets spelled out, and navigation chrome hidden. */
- @media print {
- body { font-size: 100%; }
- /* Append the URL after each link since it cannot be followed on paper. */
- a:after { content: " (" attr(href) ")"; }
- a, a:link, a:visited, a:after {
- text-decoration: underline;
- text-shadow: none !important;
- background-image: none !important;
- background: white;
- color: black;
- }
- /* Expand abbreviations inline instead of relying on the dotted hint. */
- abbr[title] { border-bottom: 0; }
- abbr[title]:after { content: " (" attr(title) ")"; }
- @page { margin: 2cm .5cm; }
- h1, h2, h3 { page-break-after: avoid; }
- /* Was `p3`, which matches no HTML element, so the rule never applied. */
- p { orphans: 3; widows: 3; }
- /* Keep images whole and small on paper. */
- img {
- page-break-inside: avoid;
- max-width: 250px !important;
- max-height: 250px !important;
- }
- nav, aside { display: none; }
- }
-
- ul.with_columns {
- column-count: 1;
- }
- @media (min-width: 20em) {
- ul.with_columns {
- column-count: 2;
- }
- }
- @media (min-width: 50em) {
- ul.with_columns {
- column-count: 3;
- }
- }
- ul.with_two_columns {
- column-count: 1;
- }
- @media (min-width: 20em) {
- ul.with_two_columns {
- column-count: 1;
- }
- }
- @media (min-width: 50em) {
- ul.with_two_columns {
- column-count: 2;
- }
- }
-
- .gallery {
- display: flex;
- flex-wrap: wrap;
- justify-content: space-around;
- }
- .gallery figure img {
- margin-left: 1rem;
- margin-right: 1rem;
- }
- .gallery figure figcaption {
- font-family: 'ConcourseT3'
- }
-
- footer {
- font-family: 'ConcourseT3';
- display: flex;
- flex-direction: column;
- border-top: 3px solid white;
- padding: 4rem 0;
- background-color: #07486c;
- color: white;
- }
- footer > * {
- max-width: 50rem;
- margin: 0 auto;
- }
- footer a {
- color: #f1c40f;
- }
- footer .avatar {
- width: 200px;
- height: 200px;
- border-radius: 50%;
- float: left;
- -webkit-shape-outside: circle();
- shape-outside: circle();
- margin-right: 2rem;
- padding: 2px 5px 5px 2px;
- background: white;
- border-left: 1px solid #f1c40f;
- border-top: 1px solid #f1c40f;
- border-right: 5px solid #f1c40f;
- border-bottom: 5px solid #f1c40f;
- }
- </style>
-
- <h1>
- <span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
- Archiving web sites (archive)
- <time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
- </h1>
- <section>
- <article lang="en">
- <h3 lang="fr"><a href="https://lwn.net/Articles/766374/">Source originale du contenu</a></h3>
- <p>I recently took a deep dive into web site archival for friends who
- were worried about losing control over the hosting of their work
- online in the face of poor system administration or hostile
- removal.
- This makes web site archival an essential instrument in the
- toolbox of any system administrator.
- As it turns out, some sites are much harder to archive than
- others. This article goes through the process of archiving traditional
- web sites and shows how it falls short when confronted with the latest
- fashions in the single-page applications that are bloating the modern web.</p>
-
- <h4>Converting simple sites</h4>
-
- <p>The days of handcrafted HTML web sites are long gone. Now web sites are
- dynamic and built on the fly using the latest JavaScript, PHP, or
- Python framework. As a result, the sites are more fragile: a database
- crash, spurious upgrade, or unpatched vulnerability might lose data.
- In my previous life as a web developer, I
- had to come to terms with the idea that customers expect web sites to
- basically work forever. This expectation matches poorly with the "move
- fast and break things" attitude of web development. Working with the
- <a href="https://drupal.org">Drupal</a> content-management system (CMS) was
- particularly
- challenging in that regard as major upgrades deliberately break
- compatibility with third-party modules, which implies a costly upgrade process that
- clients could seldom afford. The solution was to archive those sites:
- take a living, dynamic web site and turn it into plain HTML files that
- any web server can serve forever. This process is useful for your own dynamic
- sites but also for third-party sites that are outside of your control and you might want
- to safeguard.</p>
-
- <p>For simple or static sites, the venerable <a href="https://www.gnu.org/software/wget/">Wget</a> program works
- well. The incantation to mirror a full web site, however, is byzantine:</p>
-
- <pre>
- $ nice wget --mirror --execute robots=off --no-verbose --convert-links \
- --backup-converted --page-requisites --adjust-extension \
- --base=./ --directory-prefix=./ --span-hosts \
- --domains=www.example.com,example.com http://www.example.com/
- </pre>
-
- <p>The above downloads the content of the web page, but also crawls
- everything within the specified domains. Before you run this against
- your favorite site, consider the impact such a crawl might have on the
- site. The above command line deliberately ignores
- <a href="https://en.wikipedia.org/wiki/Robots_exclusion_standard"><tt>robots.txt</tt></a>
- rules, as is now <a href="https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives/">common practice for archivists</a>,
- and hammers the website as fast as it can. Most crawlers have options to
- pause between hits and limit bandwidth usage to avoid overwhelming the
- target site.
-
- </p>
-
- <p>
- The above command will also fetch "page
- requisites" like style sheets (CSS), images, and scripts. The
- downloaded page contents are modified so that links point to the local
- copy as well. Any web server can host the resulting file set, which results
- in a static copy of the original web site.</p>
-
- <p>That is, when things go well. Anyone who has ever worked with a computer
- knows that things seldom go according to plan; all sorts of
- things can make the procedure derail in interesting ways. For example,
- it was trendy for a while to have calendar blocks in web sites. A CMS
- would generate those on the fly and make crawlers go into an infinite
- loop trying to retrieve all of the pages. Crafty archivers can resort to regular expressions
- (e.g. Wget has a <code>--reject-regex</code> option) to ignore problematic
- resources. Another option, if the administration interface for the
- web site is accessible, is to disable calendars, login forms, comment
- forms, and other dynamic areas. Once the site becomes static, those
- will stop working anyway, so it makes sense to remove such clutter
- from the original site as well.</p>
-
- <h4>JavaScript doom</h4>
-
- <p>Unfortunately, some web sites are built with much more than pure
- HTML. In single-page sites, for example, the web browser builds the
- content itself by executing a small JavaScript program. A simple user
- agent like Wget will struggle to reconstruct a meaningful static copy
- of those sites as it does not support JavaScript at all. In theory, web
- sites should be using <a href="https://en.wikipedia.org/wiki/Progressive_enhancement">progressive
- enhancement</a> to have content and
- functionality available without JavaScript but those directives are
- rarely followed, as anyone using plugins like <a href="https://noscript.net/">NoScript</a> or
- <a href="https://github.com/gorhill/uMatrix">uMatrix</a> will confirm.</p>
-
- <p>Traditional archival methods sometimes fail in the dumbest way. When
- trying to build an offsite backup of a local newspaper
- (<a href="https://pamplemousse.ca/">pamplemousse.ca</a>), I found that
- WordPress adds query strings
- (e.g. <code>?ver=1.12.4</code>) at the end of JavaScript includes. This confuses
- content-type detection in the web servers that serve the archive, which
- rely on the file extension
- to send the right <code>Content-Type</code> header. When such an archive is
- loaded in a
- web browser, it fails to load scripts, which breaks dynamic websites.</p>
-
- <p>As the web moves toward using the browser as a virtual machine to run
- arbitrary code, archival methods relying on pure HTML parsing need to
- adapt. The solution for such problems is to record (and replay) the
- HTTP headers delivered by the server during the crawl and indeed
- professional archivists use just such an approach.</p>
-
- <h4>Creating and displaying WARC files</h4>
-
- <p>At the <a href="https://archive.org">Internet Archive</a>, Brewster
- Kahle and Mike Burner designed
- the <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">ARC</a> (for "ARChive") file format in 1996 to provide a way to
- aggregate the millions of small files produced by their archival
- efforts. The format was eventually standardized as the WARC ("Web
- ARChive") <a href="https://iipc.github.io/warc-specifications/">specification</a> that
- was released as an ISO standard in 2009 and
- revised in 2017. The standardization effort was led by the <a href="https://en.wikipedia.org/wiki/International_Internet_Preservation_Consortium">International Internet
- Preservation Consortium</a> (IIPC), which is an "<span>international
- organization of libraries and other organizations established to
- coordinate efforts to preserve internet content for the future</span>",
- according to Wikipedia; it includes members such as the US Library of
- Congress and the Internet Archive. The latter uses the WARC format
- internally in its Java-based <a href="https://github.com/internetarchive/heritrix3/wiki">Heritrix
- crawler</a>.</p>
-
- <p>A WARC file aggregates multiple resources like HTTP headers, file
- contents, and other metadata in a single compressed
- archive. Conveniently, Wget actually supports the file format with
- the <code>--warc</code> parameter. Unfortunately, web browsers cannot render WARC
- files directly, so a viewer or some conversion is necessary to access
- the archive. The simplest such viewer I have found is <a href="https://github.com/webrecorder/pywb">pywb</a>, a
- Python package that runs a simple webserver to offer a
- Wayback-Machine-like interface to browse the contents of WARC
- files. The following set of commands will render a WARC file on
- <tt>http://localhost:8080/</tt>:</p>
-
- <pre>
- $ pip install pywb
- $ wb-manager init example
- $ wb-manager add example crawl.warc.gz
- $ wayback
- </pre>
-
- <p>This tool was, incidentally, built by the folks behind the
- <a href="https://webrecorder.io/">Webrecorder</a> service, which can use
- a web browser to save
- dynamic page contents.</p>
-
- <p>Unfortunately, pywb has trouble loading WARC files generated by Wget
- because it <a href="https://github.com/webrecorder/pywb/issues/294">followed</a> an <a href="https://github.com/iipc/warc-specifications/issues/23">inconsistency in the 1.0
- specification</a>, which was <a href="https://github.com/iipc/warc-specifications/pull/24">fixed in the 1.1 specification</a>. Until Wget or
- pywb fix those problems, WARC files produced by Wget are not
- reliable enough for my uses, so I have looked at other alternatives. A
- crawler that got my attention is simply called <a href="https://git.autistici.org/ale/crawl/">crawl</a>. Here is how
- it is invoked:</p>
-
- <pre>
- $ crawl https://example.com/
- </pre>
-
- <p>(It <em>does</em> say "very simple" in the README.) The program does support
- some command-line options, but most of its defaults are sane: it will fetch
- page requirements from other domains (unless the <code>-exclude-related</code>
- flag is used), but does not recurse out of the domain. By default, it
- fires up ten parallel connections to the remote site, a setting that
- can be changed with the <code>-c</code> flag. But, best of all, the resulting WARC
- files load perfectly in pywb.</p>
-
- <h4>Future work and alternatives</h4>
-
- <p>There are plenty more <a href="https://archiveteam.org/index.php?title=The_WARC_Ecosystem">resources</a>
- for using WARC files. In
- particular, there's a Wget drop-in replacement called <a href="https://github.com/chfoo/wpull">Wpull</a> that is
- specifically designed for archiving web sites. It has experimental
- support for <a href="http://phantomjs.org/">PhantomJS</a> and <a href="http://rg3.github.io/youtube-dl/">youtube-dl</a> integration that
- should allow downloading more complex JavaScript sites and streaming
- multimedia, respectively. The software is the basis for an elaborate
- archival tool called <a href="https://www.archiveteam.org/index.php?title=ArchiveBot">ArchiveBot</a>,
- which is used by the "<span>loose collective of
- rogue archivists, programmers, writers and loudmouths</span>" at
- <a href="https://archiveteam.org/">ArchiveTeam</a> in its struggle to
- "<span>save the history before it's lost
- forever</span>". It seems that PhantomJS integration does not work as well as
- the team wants, so ArchiveTeam also uses a rag-tag bunch of other
- tools to mirror more complex sites. For example, <a href="https://github.com/JustAnotherArchivist/snscrape">snscrape</a> will
- crawl a social media profile to generate a list of pages to send into
- ArchiveBot. Another tool the team employs is <a href="https://github.com/PromyLOPh/crocoite">crocoite</a>, which uses
- the Chrome browser in headless mode to archive JavaScript-heavy sites.</p>
-
- <p>This article would also not be complete without a nod to the
- <a href="http://www.httrack.com/">HTTrack</a> project, the "website
- copier". Working similarly to Wget,
- HTTrack creates local copies of remote web sites but unfortunately does
- not support WARC output. Its interactive aspects might be of more
- interest to novice users unfamiliar with the command line.
-
- </p>
-
- <p>
- In the
- same vein, during my research I found a full rewrite of Wget called
- <a href="https://gitlab.com/gnuwget/wget2">Wget2</a> that has support for
- multi-threaded operation, which might make
- it faster than its predecessor. It is <a href="https://gitlab.com/gnuwget/wget2/wikis/home">missing some
- features</a> from
- Wget, however, most notably reject patterns, WARC output, and FTP support but
- adds RSS, DNS caching, and improved TLS support.</p>
-
- <p>Finally, my personal dream for these kinds of tools would be to have
- them integrated with my existing bookmark system. I currently keep
- interesting links in <a href="https://wallabag.org/">Wallabag</a>, a
- self-hosted "read it later"
- service designed as a free-software alternative to <a href="https://getpocket.com/">Pocket</a> (now owned by
- Mozilla). But Wallabag, by design, creates only a
- "readable" version of the article instead of a full copy. In some
- cases, the "readable version" is actually <a href="https://github.com/wallabag/wallabag/issues/2825">unreadable</a> and Wallabag
- sometimes <a href="https://github.com/wallabag/wallabag/issues/2914">fails to parse the article</a>. Instead, other tools like
- <a href="https://pirate.github.io/bookmark-archiver/">bookmark-archiver</a>
- or <a href="https://github.com/kanishka-linux/reminiscence">reminiscence</a> save
- a screenshot of the
- page along with full HTML but, unfortunately, no WARC file that would
- allow an even more faithful replay.</p>
-
- <p>The sad truth of my experiences with mirrors and archival is that data
- dies. Fortunately,
- amateur archivists have tools at their disposal to keep interesting
- content alive online. For those who do not want to go through that
- trouble, the Internet Archive seems to be here to stay and Archive
- Team is obviously <a href="http://iabak.archiveteam.org">working on a
- backup of the Internet Archive itself</a>.</p>
- </article>
- </section>
-
-
- <nav id="jumpto">
- <p>
- <a href="/david/blog/">Accueil du blog</a> |
- <a href="https://lwn.net/Articles/766374/">Source originale</a> |
- <a href="/david/stream/2019/">Accueil du flux</a>
- </p>
- </nav>
-
- <footer>
- <div>
- <!-- Author avatar; `alt` is required so assistive technology has a text alternative. -->
- <img src="/static/david/david-larlet-avatar.jpg" alt="Avatar de David Larlet" loading="lazy" class="avatar" width="200" height="200">
- <p>
- Bonjour/Hi!
- Je suis <a href="/david/" title="Profil public">David Larlet</a>, je vis actuellement à Montréal et j’alimente cet espace depuis 15 ans. <br>
- Si tu as apprécié cette lecture, n’hésite pas à poursuivre ton exploration. Par exemple via les <a href="/david/blog/" title="Expériences bienveillantes">réflexions bimestrielles</a>, la <a href="/david/stream/2019/" title="Pensées (dés)articulées">veille hebdomadaire</a> ou en t’abonnant au <a href="/david/log/" title="S’abonner aux publications via RSS">flux RSS</a> (<a href="/david/blog/2019/flux-rss/" title="Tiens c’est quoi un flux RSS ?">so 2005</a>).
- </p>
- <p>
- Je m’intéresse à la place que je peux avoir dans ce monde. En tant qu’humain, en tant que membre d’une famille et en tant qu’associé d’une coopérative. De temps en temps, je fais aussi des <a href="https://github.com/davidbgk" title="Principalement sur Github mais aussi ailleurs">trucs techniques</a>. Et encore plus rarement, <a href="/david/talks/" title="En ce moment je laisse plutôt la place aux autres">j’en parle</a>.
- </p>
-
- <p>
- Voici quelques articles choisis :
- <a href="/david/blog/2019/faire-equipe/" title="Accéder à l’article complet">Faire équipe</a>,
- <a href="/david/blog/2018/bivouac-automnal/" title="Accéder à l’article complet">Bivouac automnal</a>,
- <a href="/david/blog/2018/commodite-effondrement/" title="Accéder à l’article complet">Commodité et effondrement</a>,
- <a href="/david/blog/2017/donnees-communs/" title="Accéder à l’article complet">Des données aux communs</a>,
- <a href="/david/blog/2016/accompagner-enfant/" title="Accéder à l’article complet">Accompagner un enfant</a>,
- <a href="/david/blog/2016/senior-developer/" title="Accéder à l’article complet">Senior developer</a>,
- <a href="/david/blog/2016/illusion-sociale/" title="Accéder à l’article complet">L’illusion sociale</a>,
- <a href="/david/blog/2016/instantane-scopyleft/" title="Accéder à l’article complet">Instantané Scopyleft</a>,
- <a href="/david/blog/2016/enseigner-web/" title="Accéder à l’article complet">Enseigner le Web</a>,
- <a href="/david/blog/2016/simplicite-defaut/" title="Accéder à l’article complet">Simplicité par défaut</a>,
- <a href="/david/blog/2016/minimalisme-esthetique/" title="Accéder à l’article complet">Minimalisme et esthétique</a>,
- <a href="/david/blog/2014/un-web-omni-present/" title="Accéder à l’article complet">Un web omni-présent</a>,
- <a href="/david/blog/2014/manifeste-developpeur/" title="Accéder à l’article complet">Manifeste de développeur</a>,
- <a href="/david/blog/2013/confort-convivialite/" title="Accéder à l’article complet">Confort et convivialité</a>,
- <a href="/david/blog/2013/testament-numerique/" title="Accéder à l’article complet">Testament numérique</a>,
- et <a href="/david/blog/" title="Accéder aux archives">bien d’autres…</a>
- </p>
- <p>
- On peut <a href="mailto:david%40larlet.fr" title="Envoyer un courriel">échanger par courriel</a>. Si éventuellement tu souhaites que l’on travaille ensemble, tu devrais commencer par consulter le <a href="http://larlet.com">profil dédié à mon activité professionnelle</a> et/ou contacter directement <a href="http://scopyleft.fr/">scopyleft</a>, la <abbr title="Société coopérative et participative">SCOP</abbr> dont je fais partie depuis six ans. Je recommande au préalable de lire <a href="/david/blog/2018/cout-site/" title="Attention ce qui va suivre peut vous choquer">combien coûte un site</a> et pourquoi je suis plutôt favorable à une <a href="/david/pro/devis/" title="Discutons-en !">non-demande de devis</a>.
- </p>
- <p>
- Je ne traque pas ta navigation mais mon
- <abbr title="Alwaysdata, 62 rue Tiquetonne 75002 Paris, +33.184162340">hébergeur</abbr>
- conserve des logs d’accès.
- </p>
- </div>
- </footer>
- <script>
- // Smooth-scrolls to the in-page target of the #jumper link when it is clicked.
- // If the link or its target is missing, nothing is intercepted and the
- // browser's default fragment navigation is left in place.
- ;(_ => {
- const jumper = document.getElementById('jumper')
- if (!jumper) return
- jumper.addEventListener('click', e => {
- // currentTarget is always the link the listener is bound to, even if the
- // click originated on a descendant node.
- const anchor = e.currentTarget.getAttribute('href')
- const targetEl = anchor ? document.getElementById(anchor.substring(1)) : null
- if (!targetEl) return
- // Only cancel the default jump once we know we can scroll ourselves.
- e.preventDefault()
- targetEl.scrollIntoView({behavior: 'smooth'})
- })
- })()
- </script>
|