A place to cache linked articles (think custom and personal wayback machine)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

index.html 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. <!doctype html><!-- This is a valid HTML5 document. -->
  2. <!-- Screen readers, SEO, extensions and so on. -->
  3. <html lang="fr">
  4. <!-- Has to be within the first 1024 bytes, hence before the <title>
  5. See: https://www.w3.org/TR/2012/CR-html5-20121217/document-metadata.html#charset -->
  6. <meta charset="utf-8">
  7. <!-- Why no `X-UA-Compatible` meta: https://stackoverflow.com/a/6771584 -->
  8. <!-- The viewport meta is quite crowded and we are responsible for that.
  9. See: https://codepen.io/tigt/post/meta-viewport-for-2015 -->
  10. <meta name="viewport" content="width=device-width,initial-scale=1">
  11. <!-- Required to make a valid HTML5 document. -->
  12. <title>Reverse Engineering Source Code of the Biontech Pfizer Vaccine: Part 2 (archive) — David Larlet</title>
  13. <meta name="description" content="Publication mise en cache pour en conserver une trace.">
  14. <!-- That good ol' feed, subscribe :). -->
  15. <link rel="alternate" type="application/atom+xml" title="Feed" href="/david/log/">
  16. <!-- Generated from https://realfavicongenerator.net/ such a mess. -->
  17. <link rel="apple-touch-icon" sizes="180x180" href="/static/david/icons2/apple-touch-icon.png">
  18. <link rel="icon" type="image/png" sizes="32x32" href="/static/david/icons2/favicon-32x32.png">
  19. <link rel="icon" type="image/png" sizes="16x16" href="/static/david/icons2/favicon-16x16.png">
  20. <link rel="manifest" href="/static/david/icons2/site.webmanifest">
  21. <link rel="mask-icon" href="/static/david/icons2/safari-pinned-tab.svg" color="#07486c">
  22. <link rel="shortcut icon" href="/static/david/icons2/favicon.ico">
  23. <meta name="msapplication-TileColor" content="#f0f0ea">
  24. <meta name="msapplication-config" content="/static/david/icons2/browserconfig.xml">
  25. <meta name="theme-color" content="#f0f0ea">
  26. <!-- Documented, feel free to shoot an email. -->
  27. <link rel="stylesheet" href="/static/david/css/style_2020-06-19.css">
  28. <!-- See https://www.zachleat.com/web/comprehensive-webfonts/ for the trade-off. -->
  29. <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
  30. <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
  31. <link rel="preload" href="/static/david/css/fonts/triplicate_t4_poly_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" crossorigin>
  32. <link rel="preload" href="/static/david/css/fonts/triplicate_t3_regular.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
  33. <link rel="preload" href="/static/david/css/fonts/triplicate_t3_bold.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
  34. <link rel="preload" href="/static/david/css/fonts/triplicate_t3_italic.woff2" as="font" type="font/woff2" media="(prefers-color-scheme: dark)" crossorigin>
  35. <script>
  36. function toggleTheme(themeName) {
  37. document.documentElement.classList.toggle(
  38. 'forced-dark',
  39. themeName === 'dark'
  40. )
  41. document.documentElement.classList.toggle(
  42. 'forced-light',
  43. themeName === 'light'
  44. )
  45. }
  46. const selectedTheme = localStorage.getItem('theme') // value stored under the 'theme' key, read before the body is parsed
  47. if (selectedTheme !== 'undefined') { // only the literal string 'undefined' is skipped; any other value is passed on
  48. toggleTheme(selectedTheme)
  49. }
  50. </script>
  51. <meta name="robots" content="noindex, nofollow">
  52. <meta content="origin-when-cross-origin" name="referrer">
  53. <!-- Canonical URL for SEO purposes -->
  54. <link rel="canonical" href="https://berthub.eu/articles/posts/part-2-reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/">
  55. <body class="remarkdown h1-underline h2-underline h3-underline hr-center ul-star pre-tick">
  56. <article>
  57. <header>
  58. <h1>Reverse Engineering Source Code of the Biontech Pfizer Vaccine: Part 2</h1>
  59. </header>
  60. <nav>
  61. <p class="center">
  62. <a href="/david/" title="Aller à l’accueil">🏠</a> •
  63. <a href="https://berthub.eu/articles/posts/part-2-reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/" title="Lien vers le contenu original">Source originale</a>
  64. </p>
  65. </nav>
  66. <hr>
  67. <p>All BNT162b2 vaccine data on this page is sourced from this <a href="https://mednet-communities.net/inn/db/media/docs/11889.doc" target="_blank">World Health
  68. Organization
  69. document</a>.</p>
  70. <blockquote>
  71. <p>This is a living page, shared already so people can get going! But
  72. check back frequently for updates.</p>
  73. </blockquote>
  74. <p><em>Translation</em>:
  75. <a href="https://renaudguerin.net/posts/partie-2-explorons-le-code-source-du-vaccin-biontech-pfizer/" target="_blank">Français</a>
  76. / <a href="https://msakai.github.io/bnt162b2/part-2-reverse-engineering-source-code-of-the-biontech-pfizer-vaccine.ja/" target="_blank">日本語</a></p>
  77. <p>In short: the vaccine mRNA has been optimized by the manufacturer by
  78. changing bits of RNA from (say) <code>UUU</code> to <code>UUC</code>, and people would like to
  79. understand the logic behind these changes. This challenge is quite close to what
  80. cryptologists and reverse engineering people encounter regularly. On this
  81. page, you’ll find all the details you need to get cracking to reverse
  82. engineer just HOW the vaccine has been optimized.</p>
  83. <p>I thought this would just be a fun puzzle, but I have just been informed that
  84. figuring out the optimization procedure &amp; documenting it is tremendously
  85. important for researchers around the world, as this would help them design
  86. code for proteins and vaccines.</p>
  87. <p>So, if you want to help vaccine research, do read on!</p>
  88. <h2 id="the-leader-board">The leader board</h2>
  89. <p>Here are the current best entrants to the optimization algorithm (average of 20 runs):</p>
  90. <h2 id="biontech">BioNTech</h2>
  91. <p>We should all be very grateful that BioNTech has shared this data with us.
  92. And of course we should also be grateful to the many many researchers and
  93. lab workers that worked for decades to bring the state of the art to the
  94. point that such a vaccine could be developed. It is marvelous.</p>
  95. <p>Because it is so marvelous, I want to understand everything about the
  96. vaccine. I wrote a page <a href="https://berthub.eu/articles/posts/reverse-engineering-source-code-of-the-biontech-pfizer-vaccine/" target="_blank">Reverse Engineering the source code of the BioNTech/Pfizer SARS-CoV-2
  97. Vaccine</a>
  98. that describes in some detail what is in the mRNA of the vaccine. It helps
  99. to read this page before continuing, I promise you it will be interesting.</p>
  100. <p>The post left open some questions however, and this is where it gets
  101. fascinating.</p>
  102. <h2 id="the-codon-optimization">The codon optimization</h2>
  103. <p>The vaccine contains RNA code for a very <em>slightly</em> modified copy of the
  104. SARS-CoV-2 S protein.</p>
  105. <p>The RNA code of the vaccine itself however is <em>highly</em> modified from the viral original!
  106. This has been done by the manufacturer, based on their understanding of
  107. nature.</p>
  108. <p>And from what we understand, these modifications make the vaccine <strong>much
  109. much more</strong> effective. It would be a lot of fun to understand these
  110. modifications. It might for example explain why the Moderna vaccine needs
  111. 100 micrograms and the BioNTech vaccine only 30 micrograms.</p>
  112. <p>Here is the beginning of the S protein in both the virus and the BNT162b2
  113. vaccine RNA code. Exclamation marks denote differences.</p>
  114. <pre><code>Virus: AUG UUU GUU UUU CUU GUU UUA UUG CCA CUA GUC UCU AGU CAG UGU GUU
  115. Vaccine: AUG UUC GUG UUC CUG GUG CUG CUG CCU CUG GUG UCC AGC CAG UGU GUG
  116. ! ! ! ! ! ! ! ! ! ! ! ! !
  117. </code></pre>
  118. <p>RNA is a string (literally) of RNA characters, <code>A</code>, <code>C</code>, <code>G</code> and <code>U</code>. There is no
  119. physical framing on there, but it makes sense to analyse it in groups of
  120. three.</p>
  121. <p>Each group (called a codon) maps to an amino acid (denoted by a capital
  122. letter). A string of amino acids is a protein. Here is what that looks
  123. like:</p>
  124. <pre><code>Virus: AUG UUU GUU UUU CUU GUU UUA UUG CCA CUA GUC UCU AGU CAG UGU GUU
  125. M F V F L V L L P L V S S Q C V
  126. Vaccine: AUG UUC GUG UUC CUG GUG CUG CUG CCU CUG GUG UCC AGC CAG UGU GUG
  127. ! ! ! ! ! ! ! ! ! ! ! ! !
  128. </code></pre>
  129. <p>Here we can see that while the codons are different, the amino acid version
  130. is the same. There are 4*4*4 codons but only 20 amino acids. This means you
  131. can typically change every codon into one of two others, and still code for
  132. the same amino acid.</p>
  133. <p>So in the second codon, <code>UUU</code> was changed to <code>UUC</code>. This is a net addition
  134. of one ‘C’ to the vaccine. The third codon changed from <code>GUU</code> to <code>GUG</code>, which is
  135. a net addition of one <code>G</code>.</p>
  136. <p><strong>It is known that a higher fraction of <code>G</code> and <code>C</code> characters improves the
  137. efficiency of an mRNA vaccine</strong>.</p>
  138. <p>Now, if that was all there was to it, this could be the end of this page.
  139. “The algorithm is change codons so we get more G and C in there”. But then
  140. we meet the 9th codon which changes <code>CCA</code> to <code>CCU</code>.</p>
  141. <p>Throughout the ~4000 characters of the vaccine, this happens many times.</p>
  142. <h2 id="our-challenge">Our challenge</h2>
  143. <p>The goal is: find an algorithm that modifies the ‘wild type’ RNA code into
  144. the BNT162b2 one. Because everyone would like to understand how to turn
  145. viral RNA into an effective vaccine. The algorithm does not need to
  146. reproduce the <em>exact</em> RNA code of course, but it would be super nice if it
  147. came up with something very similar, while also being brief.</p>
  148. <p>To help you, I have provided the data in a number of forms, as described on
  149. <a href="https://github.com/berthubert/bnt162b2" target="_blank">the GitHub page</a>.</p>
  150. <blockquote>
  151. <p>Note that in these files the <code>U</code> mentioned above appears as a <code>T</code>. <code>U</code> and
  152. <code>T</code> are the RNA and DNA manifestations of the same information.</p>
  153. </blockquote>
  154. <p>The easiest place to start might be the
  155. ‘<a href="https://github.com/berthubert/bnt162b2/blob/master/side-by-side.csv" target="_blank">side-by-side.csv</a>’
  156. file. This lists the original and modified version of each codon, side by
  157. side:</p>
  158. <pre><code>abspos,codonOrig,codonVaccine
  159. 0,ATG,ATG
  160. 3,TTT,TTC
  161. 6,GTT,GTG
  162. ...
  163. 3813,TAC,TAC
  164. 3816,ACA,ACA
  165. 3819,TAA,TGA
  166. </code></pre>
  167. <p>There is also an equivalency table that shows which codons can be
  168. interchanged without changing the amino acid output. Please find this in
  169. <a href="https://github.com/berthubert/bnt162b2/blob/master/codon-table-grouped.csv" target="_blank">codon-table-grouped.csv</a>.
  170. There is also a visual version
  171. <a href="https://en.wikipedia.org/wiki/DNA_and_RNA_codon_tables#Standard_DNA_codon_table" target="_blank">here</a>.</p>
  172. <h2 id="a-sample-algorithm">A sample algorithm</h2>
  173. <p>On the <a href="https://github.com/berthubert/bnt162b2" target="_blank">GitHub repository</a> you can
  174. find
  175. <a href="https://github.com/berthubert/bnt162b2/blob/master/3rd-gc.go" target="_blank">3rd-gc.go</a>
  176. (and
  177. <a href="https://github.com/berthubert/bnt162b2/blob/master/3rd-gc.py" target="_blank">3rd-gc.py</a>).</p>
  178. <p>These implement a simple strategy that works like this:</p>
  179. <ul>
  180. <li>If a virus codon already ended on G or C, copy it to the vaccine mRNA</li>
  181. <li>If not, replace last nucleotide in codon by a G, see if the amino acid
  182. still matches, if so, copy to the vaccine mRNA</li>
  183. <li>Try the same with a C</li>
  184. <li>Otherwise copy as is</li>
  185. </ul>
  186. <p>Or in <code>golang</code>:</p>
  187. <pre><code>// base case, don't do anything
  188. our = vir
  189. // don't do anything if codon ends on G or C already
  190. if(vir[2] == 'G' || vir[2] =='C') {
  191. fmt.Printf("Codon ended on G or C already, not doing anything.")
  192. } else {
  193. prop = vir[:2]+"G"
  194. fmt.Printf("Attempting G substitution, new candidate '%s'. ", prop)
  195. if(c2s[vir] == c2s[prop]) {
  196. fmt.Printf("Amino acid still the same, done!")
  197. our = prop
  198. } else {
  199. fmt.Printf("Oops, amino acid changed. Trying C, new candidate '%s'. ", prop)
  200. prop = vir[:2]+"C"
  201. if(c2s[vir] == c2s[prop]) {
  202. fmt.Printf("Amino acid still the same, done!")
  203. our=prop
  204. }
  205. }
  206. }
  207. </code></pre>
  208. <p>This achieves a rather poor 53.1% match with the BioNTech RNA vaccine, but
  209. it is a start.</p>
  210. <p>When you design your algorithm, be sure to only base your choices on the
  211. virus RNA. Do not peek into the BioNTech RNA!</p>
  212. <p>If you have achieved a score beyond 53.1% please email a link to your code
  213. to bert@hubertnet.nl (or <a href="https://twitter.com/PowerDNS_Bert" target="_blank">@PowerDNS_Bert</a>)
  214. and I’ll put it on the leader board at the top of this page!</p>
  215. <h2 id="things-that-will-help">Things that will help</h2>
  216. <p>As with every form of reverse engineering or cryptanalysis, it helps to
  217. understand what we are looking at.</p>
  218. <h2 id="gc-ratio">GC ratio</h2>
  219. <p>We know that one goal of the ‘codon optimization’ is to get more <code>C</code>s and
  220. <code>G</code>s into the vaccine version of the RNA. However, there is also a limit to
  221. that. In DNA, which is also used to manufacture the vaccine, <code>G</code> and <code>C</code>
  222. bind together strongly, to the point that if you put too many of these
  223. ‘nucleotides’ in there, the DNA will no longer be replicated efficiently.</p>
  224. <p>So some modifications may actually happen to manage <em>down</em> the GC percentage of a
  225. stretch of DNA if it was getting too high.</p>
  226. <p>I <a href="https://twitter.com/PowerDNS_Bert/status/1344036143961169920" target="_blank">tweeted about this</a> earlier.</p>
  227. <h2 id="codon-optimization">Codon optimization</h2>
  228. <p>Some codons are rare in human DNA, or in certain cells. It may be that some
  229. codons are replaced by other ones simply because they are more frequently
  230. used by some cells.</p>
  231. <p>I <a href="https://twitter.com/PowerDNS_Bert/status/1344400081802448897" target="_blank">tweeted about this</a>
  232. earlier.</p>
  233. <h2 id="rna-folding">RNA folding</h2>
  234. <p>We’ve been looking at codons up to here. The RNA itself however does not
  235. know about codons, there are no markers that say where a codon begins and
  236. ends. The first codon on a protein however is always ATG (or AUG in RNA).</p>
  237. <p>RNA curls up into a shape. This shape might help evade the immune system or
  238. it might improve translation into amino acids. This only depends on the
  239. sequence of RNA nucleotides and not on specific codons.</p>
  240. <p>You can submit RNA sequences to <a href="http://rna.tbi.univie.ac.at/cgi-bin/RNAWebSuite/RNAfold.cgi" target="_blank">this server of the Institute for
  241. Theoretical Chemistry at the University of
  242. Vienna</a> and it
  243. will fold RNA for you. This is a very advanced server that does meticulous
  244. calculations.</p>
  245. <p>This <a href="https://en.wikipedia.org/wiki/Nucleic_acid_structure_prediction" target="_blank">Wikipedia
  246. page</a>
  247. describes how this works.</p>
  248. <p>It may be that some optimizations improve folding.</p>
  249. <p>I am also told that this paper by Moderna (another mRNA vaccine
  250. manufacturer) may be relevant:
  251. <a href="https://www.pnas.org/content/116/48/24075" target="_blank">mRNA structure regulates protein expression through changes in functional
  252. half-life</a>.</p>
  253. </article>
  254. <hr>
  255. <footer>
  256. <p>
  257. <a href="/david/" title="Aller à l’accueil">🏠</a> •
  258. <a href="/david/log/" title="Accès au flux RSS">🤖</a> •
  259. <a href="http://larlet.com" title="Go to my English profile" data-instant>🇨🇦</a> •
  260. <a href="mailto:david%40larlet.fr" title="Envoyer un courriel">📮</a> •
  261. <abbr title="Hébergeur : Alwaysdata, 62 rue Tiquetonne 75002 Paris, +33184162340">🧚</abbr>
  262. </p>
  263. <template id="theme-selector">
  264. <form>
  265. <fieldset>
  266. <legend>Thème</legend>
  267. <label>
  268. <input type="radio" value="auto" name="chosen-color-scheme" checked> Auto
  269. </label>
  270. <label>
  271. <input type="radio" value="dark" name="chosen-color-scheme"> Foncé
  272. </label>
  273. <label>
  274. <input type="radio" value="light" name="chosen-color-scheme"> Clair
  275. </label>
  276. </fieldset>
  277. </form>
  278. </template>
  279. </footer>
  280. <script>
  281. function loadThemeForm(templateName) {
  282. const themeSelectorTemplate = document.querySelector(templateName)
  283. const form = themeSelectorTemplate.content.firstElementChild
  284. themeSelectorTemplate.replaceWith(form)
  285. form.addEventListener('change', (e) => {
  286. const chosenColorScheme = e.target.value
  287. localStorage.setItem('theme', chosenColorScheme)
  288. toggleTheme(chosenColorScheme)
  289. })
  290. const selectedTheme = localStorage.getItem('theme')
  291. if (selectedTheme && selectedTheme !== 'undefined') {
  292. form.querySelector(`[value="${selectedTheme}"]`).checked = true
  293. }
  294. }
  295. const prefersColorSchemeDark = '(prefers-color-scheme: dark)'
  296. window.addEventListener('load', () => {
  297. let hasDarkRules = false
  298. for (const styleSheet of Array.from(document.styleSheets)) {
  299. let mediaRules = []
  300. for (const cssRule of styleSheet.cssRules) {
  301. if (cssRule.type !== CSSRule.MEDIA_RULE) {
  302. continue
  303. }
  304. // WARNING: Safari does not have/supports `conditionText`.
  305. if (cssRule.conditionText) {
  306. if (cssRule.conditionText !== prefersColorSchemeDark) {
  307. continue
  308. }
  309. } else {
  310. if (cssRule.cssText.startsWith(prefersColorSchemeDark)) {
  311. continue
  312. }
  313. }
  314. mediaRules = mediaRules.concat(Array.from(cssRule.cssRules))
  315. }
  316. // WARNING: do not try to insert a Rule to a styleSheet you are
  317. // currently iterating on, otherwise the browser will be stuck
  318. // in a infinite loop…
  319. for (const mediaRule of mediaRules) {
  320. styleSheet.insertRule(mediaRule.cssText)
  321. hasDarkRules = true
  322. }
  323. }
  324. if (hasDarkRules) {
  325. loadThemeForm('#theme-selector')
  326. }
  327. })
  328. </script>
  329. </body>
  330. </html>