A place to cache linked articles (think custom and personal wayback machine)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

index.html 29KB

5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  1. <!doctype html><!-- This is a valid HTML5 document. -->
  2. <!-- Screen readers, SEO, extensions and so on. -->
  3. <html lang=fr>
  4. <!-- Has to be within the first 1024 bytes, hence before the <title>
  5. See: https://www.w3.org/TR/2012/CR-html5-20121217/document-metadata.html#charset -->
  6. <meta charset=utf-8>
  7. <!-- Why no `X-UA-Compatible` meta: https://stackoverflow.com/a/6771584 -->
  8. <!-- The viewport meta is quite crowded and we are responsible for that.
  9. See: https://codepen.io/tigt/post/meta-viewport-for-2015 -->
  10. <meta name=viewport content="width=device-width,minimum-scale=1,initial-scale=1,shrink-to-fit=no">
  11. <!-- Required to make a valid HTML5 document. -->
  12. <title>Archiving web sites (archive) — David Larlet</title>
  13. <!-- Generated from https://realfavicongenerator.net/ such a mess. -->
  14. <link rel="apple-touch-icon" sizes="180x180" href="/static/david/icons/apple-touch-icon.png">
  15. <link rel="icon" type="image/png" sizes="32x32" href="/static/david/icons/favicon-32x32.png">
  16. <link rel="icon" type="image/png" sizes="16x16" href="/static/david/icons/favicon-16x16.png">
  17. <link rel="manifest" href="/manifest.json">
  18. <link rel="mask-icon" href="/static/david/icons/safari-pinned-tab.svg" color="#5bbad5">
  19. <link rel="shortcut icon" href="/static/david/icons/favicon.ico">
  20. <meta name="apple-mobile-web-app-title" content="David Larlet">
  21. <meta name="application-name" content="David Larlet">
  22. <meta name="msapplication-TileColor" content="#da532c">
  23. <meta name="msapplication-config" content="/static/david/icons/browserconfig.xml">
  24. <meta name="theme-color" content="#f0f0ea">
  25. <!-- That good ol' feed, subscribe :p. -->
  26. <link rel=alternate type="application/atom+xml" title=Feed href="/david/log/">
  27. <meta name="robots" content="noindex, nofollow">
  28. <meta content="origin-when-cross-origin" name="referrer">
  29. <!-- Canonical URL for SEO purposes -->
  30. <link rel="canonical" href="https://lwn.net/Articles/766374/">
  31. <style>
  32. /* http://meyerweb.com/eric/tools/css/reset/ */
  33. html, body, div, span,
  34. h1, h2, h3, h4, h5, h6, p, blockquote, pre,
  35. a, abbr, address, big, cite, code,
  36. del, dfn, em, img, ins,
  37. small, strike, strong, tt, var,
  38. dl, dt, dd, ol, ul, li,
  39. fieldset, form, label, legend,
  40. table, caption, tbody, tfoot, thead, tr, th, td,
  41. article, aside, canvas, details, embed,
  42. figure, figcaption, footer, header, hgroup,
  43. menu, nav, output, ruby, section, summary,
  44. time, mark, audio, video {
  45. margin: 0;
  46. padding: 0;
  47. border: 0;
  48. font-size: 100%;
  49. font: inherit;
  50. vertical-align: baseline;
  51. }
  52. /* HTML5 display-role reset for older browsers */
  53. article, aside, details, figcaption, figure,
  54. footer, header, hgroup, menu, nav, section { display: block; }
  55. body { line-height: 1; }
  56. blockquote, q { quotes: none; }
  57. blockquote:before, blockquote:after,
  58. q:before, q:after {
  59. content: '';
  60. content: none;
  61. }
  62. table {
  63. border-collapse: collapse;
  64. border-spacing: 0;
  65. }
  66. /* http://practicaltypography.com/equity.html */
  67. /* https://calendar.perfplanet.com/2016/no-font-face-bulletproof-syntax/ */
  68. /* https://www.filamentgroup.com/lab/js-web-fonts.html */
  69. @font-face {
  70. font-family: 'EquityTextB';
  71. src: url('/static/david/css/fonts/Equity-Text-B-Regular-webfont.woff2') format('woff2'),
  72. url('/static/david/css/fonts/Equity-Text-B-Regular-webfont.woff') format('woff');
  73. font-weight: 300;
  74. font-style: normal;
  75. font-display: swap;
  76. }
  77. @font-face {
  78. font-family: 'EquityTextB';
  79. src: url('/static/david/css/fonts/Equity-Text-B-Italic-webfont.woff2') format('woff2'),
  80. url('/static/david/css/fonts/Equity-Text-B-Italic-webfont.woff') format('woff');
  81. font-weight: 300;
  82. font-style: italic;
  83. font-display: swap;
  84. }
  85. @font-face {
  86. font-family: 'EquityTextB';
  87. src: url('/static/david/css/fonts/Equity-Text-B-Bold-webfont.woff2') format('woff2'),
  88. url('/static/david/css/fonts/Equity-Text-B-Bold-webfont.woff') format('woff');
  89. font-weight: 700;
  90. font-style: normal;
  91. font-display: swap;
  92. }
  93. @font-face {
  94. font-family: 'ConcourseT3';
  95. src: url('/static/david/css/fonts/concourse_t3_regular-webfont-20190806.woff2') format('woff2'),
  96. url('/static/david/css/fonts/concourse_t3_regular-webfont-20190806.woff') format('woff');
  97. font-weight: 300;
  98. font-style: normal;
  99. font-display: swap;
  100. }
  101. /* http://practice.typekit.com/lesson/caring-about-opentype-features/ */
  102. body {
  103. /* http://www.cssfontstack.com/ Palatino 99% Win 86% Mac */
  104. font-family: "EquityTextB", Palatino, serif;
  105. background-color: #f0f0ea;
  106. color: #07486c;
  107. font-kerning: normal;
  108. -moz-osx-font-smoothing: grayscale;
  109. -webkit-font-smoothing: subpixel-antialiased;
  110. text-rendering: optimizeLegibility;
  111. font-variant-ligatures: common-ligatures contextual;
  112. font-feature-settings: "kern", "liga", "clig", "calt";
  113. }
  114. pre, code, kbd, samp, var, tt {
  115. font-family: 'TriplicateT4c', monospace;
  116. }
  117. em {
  118. font-style: italic;
  119. color: #323a45;
  120. }
  121. strong {
  122. font-weight: bold;
  123. color: black;
  124. }
  125. nav {
  126. background-color: #323a45;
  127. color: #f0f0ea;
  128. display: flex;
  129. justify-content: space-around;
  130. padding: 1rem .5rem;
  131. }
  132. nav:last-child {
  133. border-bottom: 1vh solid #2d7474;
  134. }
  135. nav a {
  136. color: #f0f0ea;
  137. }
  138. nav abbr {
  139. border-bottom: 1px dotted white;
  140. }
  141. h1 {
  142. border-top: 1vh solid #2d7474;
  143. border-bottom: .2vh dotted #2d7474;
  144. background-color: #e3e1e1;
  145. color: #323a45;
  146. text-align: center;
  147. padding: 5rem 0 4rem 0;
  148. width: 100%;
  149. font-family: 'ConcourseT3';
  150. display: flex;
  151. flex-direction: column;
  152. }
  153. h1.single {
  154. padding-bottom: 10rem;
  155. }
  156. h1 span {
  157. position: absolute;
  158. top: 1vh;
  159. left: 20%;
  160. line-height: 0;
  161. }
  162. h1 span a {
  163. line-height: 1.7;
  164. padding: 1rem 1.2rem .6rem 1.2rem;
  165. border-radius: 0 0 6% 6%;
  166. background: #2d7474;
  167. font-size: 1.3rem;
  168. color: white;
  169. text-decoration: none;
  170. }
  171. h2 {
  172. margin: 4rem 0 1rem;
  173. border-top: .2vh solid #2d7474;
  174. padding-top: 1vh;
  175. }
  176. h3 {
  177. text-align: center;
  178. margin: 3rem 0 .75em;
  179. }
  180. hr {
  181. height: .4rem;
  182. width: .4rem;
  183. border-radius: .4rem;
  184. background: #07486c;
  185. margin: 2.5rem auto;
  186. }
  187. time {
  188. display: block; /* render <time> as a block-level box */
  189. margin-left: 0 !important;
  190. }
  191. ul, ol {
  192. margin: 2rem;
  193. }
  194. ul {
  195. list-style-type: square;
  196. }
  197. a {
  198. text-decoration-skip-ink: auto;
  199. text-decoration-thickness: 0.05em;
  200. text-underline-offset: 0.09em;
  201. }
  202. article {
  203. max-width: 50rem;
  204. display: flex;
  205. flex-direction: column;
  206. margin: 2rem auto;
  207. }
  208. article.single {
  209. border-top: .2vh dotted #2d7474;
  210. margin: -6rem auto 1rem auto;
  211. background: #f0f0ea;
  212. padding: 2rem;
  213. }
  214. article p:last-child {
  215. margin-bottom: 1rem;
  216. }
  217. p {
  218. padding: 0 .5rem;
  219. margin-left: 3rem;
  220. }
  221. p + p,
  222. figure + p {
  223. margin-top: 2rem;
  224. }
  225. blockquote {
  226. background-color: #e3e1e1;
  227. border-left: .5vw solid #2d7474;
  228. display: flex;
  229. flex-direction: column;
  230. align-items: center;
  231. padding: 1rem;
  232. margin: 1.5rem;
  233. }
  234. blockquote cite {
  235. font-style: italic;
  236. }
  237. blockquote p {
  238. margin-left: 0;
  239. }
  240. figure {
  241. border-top: .2vh solid #2d7474;
  242. background-color: #e3e1e1;
  243. text-align: center;
  244. padding: 1.5rem 0;
  245. margin: 1rem 0 0;
  246. font-size: 1.5rem;
  247. width: 100%;
  248. }
  249. figure img {
  250. max-width: 250px;
  251. max-height: 250px;
  252. border: .5vw solid #323a45;
  253. padding: 1px;
  254. }
  255. figcaption {
  256. padding: 1rem;
  257. line-height: 1.4;
  258. }
  259. aside {
  260. display: flex;
  261. flex-direction: column;
  262. background-color: #e3e1e1;
  263. padding: 1rem 0;
  264. border-bottom: .2vh solid #07486c;
  265. }
  266. aside p {
  267. max-width: 50rem;
  268. margin: 0 auto;
  269. }
  270. /* https://fvsch.com/code/css-locks/ */
  271. p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
  272. font-size: 1rem;
  273. line-height: calc( 1.5em + 0.2 * 1rem );
  274. }
  275. h1 {
  276. font-size: 1.9rem;
  277. line-height: calc( 1.2em + 0.2 * 1rem );
  278. }
  279. h2 {
  280. font-size: 1.6rem;
  281. line-height: calc( 1.3em + 0.2 * 1rem );
  282. }
  283. h3 {
  284. font-size: 1.35rem;
  285. line-height: calc( 1.4em + 0.2 * 1rem );
  286. }
  287. @media (min-width: 20em) {
  288. /* The (100vw - 20rem) / (50 - 20) part
  289. resolves to 0-1rem, depending on the
  290. viewport width (between 20em and 50em). */
  291. p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
  292. font-size: calc( 1rem + .6 * (100vw - 20rem) / (50 - 20) );
  293. line-height: calc( 1.5em + 0.2 * (100vw - 50rem) / (20 - 50) );
  294. margin-left: 0;
  295. }
  296. h1 {
  297. font-size: calc( 1.9rem + 1.5 * (100vw - 20rem) / (50 - 20) );
  298. line-height: calc( 1.2em + 0.2 * (100vw - 50rem) / (20 - 50) );
  299. }
  300. h2 {
  301. font-size: calc( 1.5rem + 1.5 * (100vw - 20rem) / (50 - 20) );
  302. line-height: calc( 1.3em + 0.2 * (100vw - 50rem) / (20 - 50) );
  303. }
  304. h3 {
  305. font-size: calc( 1.35rem + 1.5 * (100vw - 20rem) / (50 - 20) );
  306. line-height: calc( 1.4em + 0.2 * (100vw - 50rem) / (20 - 50) );
  307. }
  308. }
  309. @media (min-width: 50em) {
  310. /* The right part of the addition *must* be a
  311. rem value. In this example we *could* change
  312. the whole declaration to font-size:2.5rem,
  313. but if our baseline value was not expressed
  314. in rem we would have to use calc. */
  315. p, li, pre, code, kbd, samp, var, tt, time, details, figcaption {
  316. font-size: calc( 1rem + .6 * 1rem );
  317. line-height: 1.5em;
  318. }
  319. p, li, pre, details {
  320. margin-left: 3rem;
  321. }
  322. h1 {
  323. font-size: calc( 1.9rem + 1.5 * 1rem );
  324. line-height: 1.2em;
  325. }
  326. h2 {
  327. font-size: calc( 1.5rem + 1.5 * 1rem );
  328. line-height: 1.3em;
  329. }
  330. h3 {
  331. font-size: calc( 1.35rem + 1.5 * 1rem );
  332. line-height: 1.4em;
  333. }
  334. figure img {
  335. max-width: 500px;
  336. max-height: 500px;
  337. }
  338. }
  339. figure.unsquared {
  340. margin-bottom: 1.5rem;
  341. }
  342. figure.unsquared img {
  343. height: inherit;
  344. }
  345. @media print {
  346. body { font-size: 100%; }
  347. a:after { content: " (" attr(href) ")"; }
  348. a, a:link, a:visited, a:after {
  349. text-decoration: underline;
  350. text-shadow: none !important;
  351. background-image: none !important;
  352. background: white;
  353. color: black;
  354. }
  355. abbr[title] { border-bottom: 0; }
  356. abbr[title]:after { content: " (" attr(title) ")"; }
  357. img { page-break-inside: avoid; }
  358. @page { margin: 2cm .5cm; }
  359. h1, h2, h3 { page-break-after: avoid; }
  360. p { orphans: 3; widows: 3; } /* print: keep at least 3 lines of a paragraph on each side of a page break */
  361. img {
  362. max-width: 250px !important;
  363. max-height: 250px !important;
  364. }
  365. nav, aside { display: none; }
  366. }
  367. ul.with_columns {
  368. column-count: 1;
  369. }
  370. @media (min-width: 20em) {
  371. ul.with_columns {
  372. column-count: 2;
  373. }
  374. }
  375. @media (min-width: 50em) {
  376. ul.with_columns {
  377. column-count: 3;
  378. }
  379. }
  380. ul.with_two_columns {
  381. column-count: 1;
  382. }
  383. @media (min-width: 20em) {
  384. ul.with_two_columns {
  385. column-count: 1;
  386. }
  387. }
  388. @media (min-width: 50em) {
  389. ul.with_two_columns {
  390. column-count: 2;
  391. }
  392. }
  393. .gallery {
  394. display: flex;
  395. flex-wrap: wrap;
  396. justify-content: space-around;
  397. }
  398. .gallery figure img {
  399. margin-left: 1rem;
  400. margin-right: 1rem;
  401. }
  402. .gallery figure figcaption {
  403. font-family: 'ConcourseT3'
  404. }
  405. footer {
  406. font-family: 'ConcourseT3';
  407. display: flex;
  408. flex-direction: column;
  409. border-top: 3px solid white;
  410. padding: 4rem 0;
  411. background-color: #07486c;
  412. color: white;
  413. }
  414. footer > * {
  415. max-width: 50rem;
  416. margin: 0 auto;
  417. }
  418. footer a {
  419. color: #f1c40f;
  420. }
  421. footer .avatar {
  422. width: 200px;
  423. height: 200px;
  424. border-radius: 50%;
  425. float: left;
  426. -webkit-shape-outside: circle();
  427. shape-outside: circle();
  428. margin-right: 2rem;
  429. padding: 2px 5px 5px 2px;
  430. background: white;
  431. border-left: 1px solid #f1c40f;
  432. border-top: 1px solid #f1c40f;
  433. border-right: 5px solid #f1c40f;
  434. border-bottom: 5px solid #f1c40f;
  435. }
  436. </style>
  437. <h1>
  438. <span><a id="jumper" href="#jumpto" title="Un peu perdu ?">?</a></span>
  439. Archiving web sites (archive)
  440. <time>Pour la pérennité des contenus liés. Non-indexé, retrait sur simple email.</time>
  441. </h1>
  442. <section>
  443. <article>
  444. <h3><a href="https://lwn.net/Articles/766374/">Source originale du contenu</a></h3>
  445. <p>I recently took a deep dive into web site archival for friends who
  446. were worried about losing control over the hosting of their work
  447. online in the face of poor system administration or hostile
  448. removal.
  449. This makes web site archival an essential instrument in the
  450. toolbox of any system administrator.
  451. As it turns out, some sites are much harder to archive than
  452. others. This article goes through the process of archiving traditional
  453. web sites and shows how it falls short when confronted with the latest
  454. fashions in the single-page applications that are bloating the modern web.</p>
  455. <h4>Converting simple sites</h4>
  456. <p>The days of handcrafted HTML web sites are long gone. Now web sites are
  457. dynamic and built on the fly using the latest JavaScript, PHP, or
  458. Python framework. As a result, the sites are more fragile: a database
  459. crash, spurious upgrade, or unpatched vulnerability might lose data.
  460. In my previous life as web developer, I
  461. had to come to terms with the idea that customers expect web sites to
  462. basically work forever. This expectation matches poorly with the "move
  463. fast and break things" attitude of web development. Working with the
  464. <a href="https://drupal.org">Drupal</a> content-management system (CMS) was
  465. particularly
  466. challenging in that regard as major upgrades deliberately break
  467. compatibility with third-party modules, which implies a costly upgrade process that
  468. clients could seldom afford. The solution was to archive those sites:
  469. take a living, dynamic web site and turn it into plain HTML files that
  470. any web server can serve forever. This process is useful for your own dynamic
  471. sites but also for third-party sites that are outside of your control and you might want
  472. to safeguard.</p>
  473. <p>For simple or static sites, the venerable <a href="https://www.gnu.org/software/wget/">Wget</a> program works
  474. well. The incantation to mirror a full web site, however, is byzantine:</p>
  475. <pre>
  476. $ nice wget --mirror --execute robots=off --no-verbose --convert-links \
  477. --backup-converted --page-requisites --adjust-extension \
  478. --base=./ --directory-prefix=./ --span-hosts \
  479. --domains=www.example.com,example.com http://www.example.com/
  480. </pre>
  481. <p>The above downloads the content of the web page, but also crawls
  482. everything within the specified domains. Before you run this against
  483. your favorite site, consider the impact such a crawl might have on the
  484. site. The above command line deliberately ignores
  485. <a href="https://en.wikipedia.org/wiki/Robots_exclusion_standard"><tt>robots.txt</tt></a>
  486. rules, as is now <a href="https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives/">common practice for archivists</a>,
  487. and hammers the website as fast as it can. Most crawlers have options to
  488. pause between hits and limit bandwidth usage to avoid overwhelming the
  489. target site.
  490. </p>
  491. <p>
  492. The above command will also fetch "page
  493. requisites" like style sheets (CSS), images, and scripts. The
  494. downloaded page contents are modified so that links point to the local
  495. copy as well. Any web server can host the resulting file set, which results
  496. in a static copy of the original web site.</p>
  497. <p>That is, when things go well. Anyone who has ever worked with a computer
  498. knows that things seldom go according to plan; all sorts of
  499. things can make the procedure derail in interesting ways. For example,
  500. it was trendy for a while to have calendar blocks in web sites. A CMS
  501. would generate those on the fly and make crawlers go into an infinite
  502. loop trying to retrieve all of the pages. Crafty archivers can resort to regular expressions
  503. (e.g. Wget has a <code>--reject-regex</code> option) to ignore problematic
  504. resources. Another option, if the administration interface for the
  505. web site is accessible, is to disable calendars, login forms, comment
  506. forms, and other dynamic areas. Once the site becomes static, those
  507. will stop working anyway, so it makes sense to remove such clutter
  508. from the original site as well.</p>
  509. <h4>JavaScript doom</h4>
  510. <p>Unfortunately, some web sites are built with much more than pure
  511. HTML. In single-page sites, for example, the web browser builds the
  512. content itself by executing a small JavaScript program. A simple user
  513. agent like Wget will struggle to reconstruct a meaningful static copy
  514. of those sites as it does not support JavaScript at all. In theory, web
  515. sites should be using <a href="https://en.wikipedia.org/wiki/Progressive_enhancement">progressive
  516. enhancement</a> to have content and
  517. functionality available without JavaScript but those directives are
  518. rarely followed, as anyone using plugins like <a href="https://noscript.net/">NoScript</a> or
  519. <a href="https://github.com/gorhill/uMatrix">uMatrix</a> will confirm.</p>
  520. <p>Traditional archival methods sometimes fail in the dumbest way. When
  521. trying to build an offsite backup of a local newspaper
  522. (<a href="https://pamplemousse.ca/">pamplemousse.ca</a>), I found that
  523. WordPress adds query strings
  524. (e.g. <code>?ver=1.12.4</code>) at the end of JavaScript includes. This confuses
  525. content-type detection in the web servers that serve the archive, which
  526. rely on the file extension
  527. to send the right <code>Content-Type</code> header. When such an archive is
  528. loaded in a
  529. web browser, it fails to load scripts, which breaks dynamic websites.</p>
  530. <p>As the web moves toward using the browser as a virtual machine to run
  531. arbitrary code, archival methods relying on pure HTML parsing need to
  532. adapt. The solution for such problems is to record (and replay) the
  533. HTTP headers delivered by the server during the crawl and indeed
  534. professional archivists use just such an approach.</p>
  535. <h4>Creating and displaying WARC files</h4>
  536. <p>At the <a href="https://archive.org">Internet Archive</a>, Brewster
  537. Kahle and Mike Burner designed
  538. the <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">ARC</a> (for "ARChive") file format in 1996 to provide a way to
  539. aggregate the millions of small files produced by their archival
  540. efforts. The format was eventually standardized as the WARC ("Web
  541. ARChive") <a href="https://iipc.github.io/warc-specifications/">specification</a> that
  542. was released as an ISO standard in 2009 and
  543. revised in 2017. The standardization effort was led by the <a href="https://en.wikipedia.org/wiki/International_Internet_Preservation_Consortium">International Internet
  544. Preservation Consortium</a> (IIPC), which is an "<span>international
  545. organization of libraries and other organizations established to
  546. coordinate efforts to preserve internet content for the future</span>",
  547. according to Wikipedia; it includes members such as the US Library of
  548. Congress and the Internet Archive. The latter uses the WARC format
  549. internally in its Java-based <a href="https://github.com/internetarchive/heritrix3/wiki">Heritrix
  550. crawler</a>.</p>
  551. <p>A WARC file aggregates multiple resources like HTTP headers, file
  552. contents, and other metadata in a single compressed
  553. archive. Conveniently, Wget actually supports the file format with
  554. the <code>--warc</code> parameter. Unfortunately, web browsers cannot render WARC
  555. files directly, so a viewer or some conversion is necessary to access
  556. the archive. The simplest such viewer I have found is <a href="https://github.com/webrecorder/pywb">pywb</a>, a
  557. Python package that runs a simple webserver to offer a
  558. Wayback-Machine-like interface to browse the contents of WARC
  559. files. The following set of commands will render a WARC file on
  560. <tt>http://localhost:8080/</tt>:</p>
  561. <pre>
  562. $ pip install pywb
  563. $ wb-manager init example
  564. $ wb-manager add example crawl.warc.gz
  565. $ wayback
  566. </pre>
  567. <p>This tool was, incidentally, built by the folks behind the
  568. <a href="https://webrecorder.io/">Webrecorder</a> service, which can use
  569. a web browser to save
  570. dynamic page contents.</p>
  571. <p>Unfortunately, pywb has trouble loading WARC files generated by Wget
  572. because it <a href="https://github.com/webrecorder/pywb/issues/294">followed</a> an <a href="https://github.com/iipc/warc-specifications/issues/23">inconsistency in the 1.0
  573. specification</a>, which was <a href="https://github.com/iipc/warc-specifications/pull/24">fixed in the 1.1 specification</a>. Until Wget or
  574. pywb fix those problems, WARC files produced by Wget are not
  575. reliable enough for my uses, so I have looked at other alternatives. A
  576. crawler that got my attention is simply called <a href="https://git.autistici.org/ale/crawl/">crawl</a>. Here is how
  577. it is invoked:</p>
  578. <pre>
  579. $ crawl https://example.com/
  580. </pre>
  581. <p>(It <em>does</em> say "very simple" in the README.) The program does support
  582. some command-line options, but most of its defaults are sane: it will fetch
  583. page requirements from other domains (unless the <code>-exclude-related</code>
  584. flag is used), but does not recurse out of the domain. By default, it
  585. fires up ten parallel connections to the remote site, a setting that
  586. can be changed with the <code>-c</code> flag. But, best of all, the resulting WARC
  587. files load perfectly in pywb.</p>
  588. <h4>Future work and alternatives</h4>
  589. <p>There are plenty more <a href="https://archiveteam.org/index.php?title=The_WARC_Ecosystem">resources</a>
  590. for using WARC files. In
  591. particular, there's a Wget drop-in replacement called <a href="https://github.com/chfoo/wpull">Wpull</a> that is
  592. specifically designed for archiving web sites. It has experimental
  593. support for <a href="http://phantomjs.org/">PhantomJS</a> and <a href="http://rg3.github.io/youtube-dl/">youtube-dl</a> integration that
  594. should allow downloading more complex JavaScript sites and streaming
  595. multimedia, respectively. The software is the basis for an elaborate
  596. archival tool called <a href="https://www.archiveteam.org/index.php?title=ArchiveBot">ArchiveBot</a>,
  597. which is used by the "<span>loose collective of
  598. rogue archivists, programmers, writers and loudmouths</span>" at
  599. <a href="https://archiveteam.org/">ArchiveTeam</a> in its struggle to
  600. "<span>save the history before it's lost
  601. forever</span>". It seems that PhantomJS integration does not work as well as
  602. the team wants, so ArchiveTeam also uses a rag-tag bunch of other
  603. tools to mirror more complex sites. For example, <a href="https://github.com/JustAnotherArchivist/snscrape">snscrape</a> will
  604. crawl a social media profile to generate a list of pages to send into
  605. ArchiveBot. Another tool the team employs is <a href="https://github.com/PromyLOPh/crocoite">crocoite</a>, which uses
  606. the Chrome browser in headless mode to archive JavaScript-heavy sites.</p>
  607. <p>This article would also not be complete without a nod to the
  608. <a href="http://www.httrack.com/">HTTrack</a> project, the "website
  609. copier". Working similarly to Wget,
  610. HTTrack creates local copies of remote web sites but unfortunately does
  611. not support WARC output. Its interactive aspects might be of more
  612. interest to novice users unfamiliar with the command line.
  613. </p>
  614. <p>
  615. In the
  616. same vein, during my research I found a full rewrite of Wget called
  617. <a href="https://gitlab.com/gnuwget/wget2">Wget2</a> that has support for
  618. multi-threaded operation, which might make
  619. it faster than its predecessor. It is <a href="https://gitlab.com/gnuwget/wget2/wikis/home">missing some
  620. features</a> from
  621. Wget, however, most notably reject patterns, WARC output, and FTP support but
  622. adds RSS, DNS caching, and improved TLS support.</p>
  623. <p>Finally, my personal dream for these kinds of tools would be to have
  624. them integrated with my existing bookmark system. I currently keep
  625. interesting links in <a href="https://wallabag.org/">Wallabag</a>, a
  626. self-hosted "read it later"
  627. service designed as a free-software alternative to <a href="https://getpocket.com/">Pocket</a> (now owned by
  628. Mozilla). But Wallabag, by design, creates only a
  629. "readable" version of the article instead of a full copy. In some
  630. cases, the "readable version" is actually <a href="https://github.com/wallabag/wallabag/issues/2825">unreadable</a> and Wallabag
  631. sometimes <a href="https://github.com/wallabag/wallabag/issues/2914">fails to parse the article</a>. Instead, other tools like
  632. <a href="https://pirate.github.io/bookmark-archiver/">bookmark-archiver</a>
  633. or <a href="https://github.com/kanishka-linux/reminiscence">reminiscence</a> save
  634. a screenshot of the
  635. page along with full HTML but, unfortunately, no WARC file that would
  636. allow an even more faithful replay.</p>
  637. <p>The sad truth of my experiences with mirrors and archival is that data
  638. dies. Fortunately,
  639. amateur archivists have tools at their disposal to keep interesting
  640. content alive online. For those who do not want to go through that
  641. trouble, the Internet Archive seems to be here to stay and Archive
  642. Team is obviously <a href="http://iabak.archiveteam.org">working on a
  643. backup of the Internet Archive itself</a>.</p>
  644. </article>
  645. </section>
  646. <nav id="jumpto">
  647. <p>
  648. <a href="/david/blog/">Accueil du blog</a> |
  649. <a href="https://lwn.net/Articles/766374/">Source originale</a> |
  650. <a href="/david/stream/2019/">Accueil du flux</a>
  651. </p>
  652. </nav>
  653. <footer>
  654. <div>
  655. <img src="/static/david/david-larlet-avatar.jpg" loading="lazy" class="avatar" width="200" height="200" alt="Avatar de David Larlet">
  656. <p>
  657. Bonjour/Hi!
  658. Je suis <a href="/david/" title="Profil public">David&nbsp;Larlet</a>, je vis actuellement à Montréal et j’alimente cet espace depuis 15 ans. <br>
  659. Si tu as apprécié cette lecture, n’hésite pas à poursuivre ton exploration. Par exemple via les <a href="/david/blog/" title="Expériences bienveillantes">réflexions bimestrielles</a>, la <a href="/david/stream/2019/" title="Pensées (dés)articulées">veille hebdomadaire</a> ou en t’abonnant au <a href="/david/log/" title="S’abonner aux publications via RSS">flux RSS</a> (<a href="/david/blog/2019/flux-rss/" title="Tiens c’est quoi un flux RSS ?">so 2005</a>).
  660. </p>
  661. <p>
  662. Je m’intéresse à la place que je peux avoir dans ce monde. En tant qu’humain, en tant que membre d’une famille et en tant qu’associé d’une coopérative. De temps en temps, je fais aussi des <a href="https://github.com/davidbgk" title="Principalement sur Github mais aussi ailleurs">trucs techniques</a>. Et encore plus rarement, <a href="/david/talks/" title="En ce moment je laisse plutôt la place aux autres">j’en parle</a>.
  663. </p>
  664. <p>
  665. Voici quelques articles choisis :
  666. <a href="/david/blog/2019/faire-equipe/" title="Accéder à l’article complet">Faire équipe</a>,
  667. <a href="/david/blog/2018/bivouac-automnal/" title="Accéder à l’article complet">Bivouac automnal</a>,
  668. <a href="/david/blog/2018/commodite-effondrement/" title="Accéder à l’article complet">Commodité et effondrement</a>,
  669. <a href="/david/blog/2017/donnees-communs/" title="Accéder à l’article complet">Des données aux communs</a>,
  670. <a href="/david/blog/2016/accompagner-enfant/" title="Accéder à l’article complet">Accompagner un enfant</a>,
  671. <a href="/david/blog/2016/senior-developer/" title="Accéder à l’article complet">Senior developer</a>,
  672. <a href="/david/blog/2016/illusion-sociale/" title="Accéder à l’article complet">L’illusion sociale</a>,
  673. <a href="/david/blog/2016/instantane-scopyleft/" title="Accéder à l’article complet">Instantané Scopyleft</a>,
  674. <a href="/david/blog/2016/enseigner-web/" title="Accéder à l’article complet">Enseigner le Web</a>,
  675. <a href="/david/blog/2016/simplicite-defaut/" title="Accéder à l’article complet">Simplicité par défaut</a>,
  676. <a href="/david/blog/2016/minimalisme-esthetique/" title="Accéder à l’article complet">Minimalisme et esthétique</a>,
  677. <a href="/david/blog/2014/un-web-omni-present/" title="Accéder à l’article complet">Un web omni-présent</a>,
  678. <a href="/david/blog/2014/manifeste-developpeur/" title="Accéder à l’article complet">Manifeste de développeur</a>,
  679. <a href="/david/blog/2013/confort-convivialite/" title="Accéder à l’article complet">Confort et convivialité</a>,
  680. <a href="/david/blog/2013/testament-numerique/" title="Accéder à l’article complet">Testament numérique</a>,
  681. et <a href="/david/blog/" title="Accéder aux archives">bien d’autres…</a>
  682. </p>
  683. <p>
  684. On peut <a href="mailto:david%40larlet.fr" title="Envoyer un courriel">échanger par courriel</a>. Si éventuellement tu souhaites que l’on travaille ensemble, tu devrais commencer par consulter le <a href="http://larlet.com">profil dédié à mon activité professionnelle</a> et/ou contacter directement <a href="http://scopyleft.fr/">scopyleft</a>, la <abbr title="Société coopérative et participative">SCOP</abbr> dont je fais partie depuis six ans. Je recommande au préalable de lire <a href="/david/blog/2018/cout-site/" title="Attention ce qui va suivre peut vous choquer">combien coûte un site</a> et pourquoi je suis plutôt favorable à une <a href="/david/pro/devis/" title="Discutons-en !">non-demande de devis</a>.
  685. </p>
  686. <p>
  687. Je ne traque pas ta navigation mais mon
  688. <abbr title="Alwaysdata, 62 rue Tiquetonne 75002 Paris, +33.184162340">hébergeur</abbr>
  689. conserve des logs d’accès.
  690. </p>
  691. </div>
  692. </footer>
  693. <script type="text/javascript">
  694. ;(_ => {
  695. const jumper = document.getElementById('jumper')
  696. jumper.addEventListener('click', e => {
  697. e.preventDefault()
  698. const anchor = e.target.getAttribute('href')
  699. const targetEl = document.getElementById(anchor.substring(1))
  700. targetEl.scrollIntoView({behavior: 'smooth'})
  701. })
  702. })()
  703. </script>