Update, remove generators etc.

This commit is contained in:
Sarah Jamie Lewis 2021-08-10 11:34:54 -07:00
parent 827178f330
commit 409b9400f7
14 changed files with 359 additions and 249 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/ssb.iml" filepath="$PROJECT_DIR$/.idea/ssb.iml" />
</modules>
</component>
</project>

9
.idea/ssb.iml Normal file
View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="Go" enabled="true" />
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

View File

@ -1,6 +1,5 @@
<footer>
Created with <a href="https://github.com/maciejzj/ssb">ssb</a> a simple
static blogger.
Sarah Jamie Lewis
</footer>
</body>
</html>

View File

@ -3,7 +3,7 @@
<head>
<meta charset="utf-8">
<title>Title</title>
<title>$TITLE | pseudorandom</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" href="styles.css">
@ -21,9 +21,10 @@
<body>
<header>
<nav>
<strong>Title</strong>
<strong>pseudorandom</strong>
<a href="./index.html">home</a>
<a href="mailto:">email</a>
<a href="mailto:sarah@openprivacy.ca">email</a>
<a href="cwtch:icyt7rvdsdci42h6si2ibtwucdmjrlcb2ezkecuagtquiiflbkxf2cqd">cwtch</a>
</nav>
</header>
<article>

View File

@ -1,50 +0,0 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset="utf-8">
<title>Title</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" href="styles.css">
<link rel="stylesheet" href="/katex/katex.min.css" integrity="sha384-RZU/ijkSsFbcmivfdRBQDtwuwVqK7GMOw6IMvKyeWL2K5UAlyp6WonmB8m7Jd0Hn" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="/katex//katex.min.js" integrity="sha384-pK1WpvzWVBQiP0/GjnvRxV4mOb0oxFuyRxJlk6vVw146n3egcN5C925NCP7a7BY8" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="/katex/auto-render.min.js" integrity="sha384-vZTG03m+2yp6N6BNi5iM4rW4oIwk5DfcNdFfxkk9ZWpDriOkXX8voJBFrAO7MpVl" crossorigin="anonymous"
onload="renderMathInElement(document.body);"></script>
</head>
<body>
<header>
<nav>
<strong>Title</strong>
<a href="./index.html">home</a>
<a href="mailto:">email</a>
</nav>
</header>
<article>
<h1 id="hello-world">Hello World</h1>
This is a test. I like writing tests….
<p class="sidenote">
This is another test
</p>
<p>And there we go</p>
<p>Regardless I will probably end up writing some amount of work here…</p>
<p>This is some more words</p>
<p><br /><span class="math display"><em>x</em>=<em>π</em></span><br /></p>
<p><br /><span class="math display"><em>α</em><em>β</em><em>γ</em><em>ρ</em><em>σ</em><em>δ</em><em>ϵ</em></span><br /></p>
<p><br /><span class="math display"><em>x</em><sup><em>n</em></sup>+<em>y</em><sup><em>n</em></sup>=<em>z</em><sup><em>n</em></sup></span><br /></p>
</article>
<p>2021-08-09 <a href="hello_world.html">Hello World</a><br></p>
<footer>
Created with <a href="https://github.com/maciejzj/ssb">ssb</a> a simple
static blogger.
</footer>
</body>
</html>

View File

@ -3,7 +3,7 @@
<head>
<meta charset="utf-8">
<title>Title</title>
<title>Welcome! | pseudorandom</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" href="styles.css">
@ -21,20 +21,23 @@
<body>
<header>
<nav>
<strong>Title</strong>
<strong>pseudorandom</strong>
<a href="./index.html">home</a>
<a href="mailto:">email</a>
<a href="mailto:sarah@openprivacy.ca">email</a>
<a href="cwtch:icyt7rvdsdci42h6si2ibtwucdmjrlcb2ezkecuagtquiiflbkxf2cqd">cwtch</a>
</nav>
</header>
<article>
<h1 id="welcome">Welcome!</h1>
</article>
<p>2021-08-09 <a href="hello_world.html">Hello World</a><br></p>
<hr/>
<h2>
Recent Articles
</h2>
<p>2021-08-10 <a href="obfuscated_apples.html">Obfuscated Apples</a><br></p>
<footer>
Created with <a href="https://github.com/maciejzj/ssb">ssb</a> a simple
static blogger.
Sarah Jamie Lewis
</footer>
</body>
</html>

View File

@ -1 +1,2 @@
# Welcome!

125
obfuscated_apples.html Normal file
View File

@ -0,0 +1,125 @@
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset="utf-8">
<title>Obfuscated Apples | pseudorandom</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" href="styles.css">
<link rel="stylesheet" href="/katex/katex.min.css" integrity="sha384-RZU/ijkSsFbcmivfdRBQDtwuwVqK7GMOw6IMvKyeWL2K5UAlyp6WonmB8m7Jd0Hn" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="/katex/katex.min.js" integrity="sha384-pK1WpvzWVBQiP0/GjnvRxV4mOb0oxFuyRxJlk6vVw146n3egcN5C925NCP7a7BY8" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="/katex/auto-render.min.js" integrity="sha384-vZTG03m+2yp6N6BNi5iM4rW4oIwk5DfcNdFfxkk9ZWpDriOkXX8voJBFrAO7MpVl" crossorigin="anonymous"
onload="renderMathInElement(document.body);"></script>
</head>
<body>
<header>
<nav>
<strong>pseudorandom</strong>
<a href="./index.html">home</a>
<a href="mailto:sarah@openprivacy.ca">email</a>
<a href="cwtch:icyt7rvdsdci42h6si2ibtwucdmjrlcb2ezkecuagtquiiflbkxf2cqd">cwtch</a>
</nav>
</header>
<article>
<h1 id="obfuscated-apples">Obfuscated Apples</h1>
Generating noise in a way which is indistinguishable from real signal is a ridiculously hard problem. Obfuscation does not hide signal, it only adds noise.
<p class="sidenote">
if you take anything away from this article please let it be this fact.
</p>
<p>Sadly, most people operate under the assumption that adding noise to a system is all that it takes to make the signal unrecoverable. This logic is very clearly in operation in Apple’s new proposal of on-device scanning <a class="sidenote" href="https://www.apple.com/child-safety/pdf/CSAM_Detection_Technical_Summary.pdf">technical summary</a> which, among other things, proposes generating <em>synthetic</em> matches to hide the true number of <em>real</em> matches in the system.</p>
I want to take this opportunity to break down how this kind of obfuscation can be defeated even when not considering the fact that it is Apple themselves who are charged with generating and maintaining the safety parameters of the system.
<p class="sidenote">
i.e. even if we treat the people who design and build this system as honest adversaries.
</p>
<h2 id="sketching-a-basic-scheme">Sketching a Basic Scheme</h2>
<p>For the sake of clarity I will omit the technical details of the private set intersection protocol, and the threshold scheme, and we will operate under the assumption that both are cryptographically secure. We will also, for now, assume that the database of images to compare is incorruptible.</p>
<p>At the heart of the system is a (mostly) black box that contains a perceptual hash function that analyzes an image and spits out a hash, this hash is then compared against a database of known hashes and if a match is found the system reports <code>true</code> and otherwise reports <code>false</code>.</p>
<p>Throughout this article I will use <strong>match</strong> when talking about both true and false positives.</p>
<p>Without question, the server learns a match occurred on the phone during the PSI protocol.</p>
<blockquote class="sidenote">
“The output of PSI protocol on the server reveals whether there is a match or not” - Apple Technical Summary
</blockquote>
<p>When a certain threshold of real matches is reached, the server gains the ability to decrypt all real matches, a human reviews them, and a determination is made.</p>
As presented the system above has one major flaw
<p class="sidenote">
(besides the gross nature of co-opting a personal device as a surveillance system)
</p>
<p>: the server learns how many matches the device has reported prior to being able to decrypt those matches. This is obviously very important metadata in the context of the system and as such needs to be protected.</p>
<p>To do so Apple rely on the invocation of “Synthetic Vouchers” with the following property:</p>
<ul>
<li>“The probability that a device uploads a synthetic voucher instead of a real voucher for an image is calibrated to ensure the total number of synthetics is of the same order of magnitude as the threshold”</li>
</ul>
<h2 id="additional-metadata">Additional Metadata</h2>
<p>So, that is it right? Problem averted? Let’s not be too hasty…</p>
<p>From what we know so far there are a few interesting parameters in this system: a threshold <span class="math inline"><em>t</em></span>, a probability of generating a synthetic match <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> and the probability of a false positive match <span class="math inline"><em>P</em>(<code>falsepositive</code>)</span>.</p>
<p>We also know that Apple has constructed these parameters such that the probability of an account being flagged for human review (i.e. when the number of matches <span class="math inline"><em>M</em>&gt;<em>t</em></span>) is <span class="math inline"><em>P</em>(<code>flag</code>)=10<sup>−12</sup></span>, or one in one trillion.</p>
<blockquote class="sidenote">
“The threshold is selected to provide an extremely low (1 in 1 trillion) probability of incorrectly flagging a given account.” - Apple Technical Summary
</blockquote>
<p>From these parameters we can start to derive some additional metadata that exists within the system:</p>
<h3 id="matches-over-threshold-without-decryption">Matches over Threshold without Decryption</h3>
<p>One of the most obvious sources of distinguishing information is built explicitly into the design. The threshold scheme as proposed requires <span class="math inline"><em>t</em></span> real matches in order to decrypt the inner envelopes containing the matched images.</p>
<p>Taking the number of observed matches, <span class="math inline"><em>o</em></span>, the system is able to derive a value <span class="math inline"><em>t</em>′=<em>o</em>/<em>t</em></span> or, the number of observed matches divided by the threshold.</p>
<p>Every time a new match is observed and <span class="math inline"><em>t</em>′&gt;1</span> (i.e. the number of observed matches exceeds the threshold) the system learns additional information regarding the makeup of previous matches.</p>
<p>For example, if we set <span class="math inline"><em>t</em>=10</span>, then after the system observes <span class="math inline">10</span> matches (made up of an unknown number of real or synthetic matches) then every additional match until decryption can be used to derive information about the previous matches. If the next match to arrive does not allow decryption then the server can derive that there are at least 2 synthetic matches in the bucket. This confidence increases with every observation that does not result in decryption.</p>
<p>Further we know that synthetic matches happen with probability <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> and as such the <strong>rate</strong> at which matches are observed, in combination with the above, allows us to define a probability <span class="math inline"><em>P</em>(<code>match</code>)</span> given that the observation did not result in decryption, and the known probability of a device generating a synthetic value <span class="math inline"><em>P</em>(<code>synthetic</code>)</span>.</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{match}|\texttt{real}) \times P(\texttt{observation})}{P(\texttt{synthetic})}$$</span><br /></p>
<p>We know that all matches will result in an observation and so…</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{1 \times P(\texttt{observation})}{P(\texttt{synthetic})}$$</span><br /></p>
<p>Or more simply:</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{observation})}{P(\texttt{synthetic})}$$</span><br /></p>
<p>Finally, we can state that the probability of an observation (say over the period of a day) is the probability of a real match on any given day plus the probability of a synthetic match on any given day:</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic})+P(\texttt{match})}{P(\texttt{synthetic})}$$</span><br /></p>
<p>Given that the probability of a synthetic match is defined by Apple, the only unknown in the system is the probability of a match.</p>
While this may start as an unknown
<p class="sidenote">
setting aside the fact that Apple already has enough data to derive this themselves
</p>
<p>, Apple quickly generates a large amount of data relating to when new observations are made. Since people are different in the ways they take and store photos and live in different parts of the world, the exact probability of them triggering a check is dependent on them.</p>
<p>It is also obvious at this point that if <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> is generated <em>naively</em> <span class="sidenote">(<em>naively</em> as in not account specific)</span> then <em>any</em> additional information known about a particular account e.g. the number of photos they are likely to store over a period of time is enough for <strong>anyone</strong> who knows <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> to derive <span class="math inline"><em>P</em>(<code>match</code>)</span>. <span class="sidenote">We will come back to this point shortly</span></p>
<p>The secrecy then is highly dependent on both Apple never deriving <span class="math inline"><em>P</em>(<code>observation</code>)</span> for themselves <em>and</em> on Apple generating a distinct <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> for each account. Or rather, the privacy of one of the most sensitive aspects of this system requires Apple both collecting no information on accounts, and also on Apple knowing enough about accounts to derive the parameters necessary to keep the information private.</p>
<hr/>
<p>It is actually worse than that though. Remember that Apple has stated that the actual probability of observing <span class="math inline"><em>M</em>&gt;<em>t</em></span> false positive matches is 1 in a trillion.</p>
<p>We can work backwards from that number to derive <span class="math inline"><em>P</em>(<code>falsepositive</code>)</span>:</p>
<p><br /><span class="math display">$$P(\texttt{flag}) = \sum_{\substack{x = t}}^T {T \choose x} \cdot P(\texttt{falsepositive})^x \cdot (1 - P(\texttt{falsepositive}))^{T - x} \approx 10^{-12}$$</span><br /></p>
<p>In order to finalize this we only need to make educated guesses about 2 parameters: the threshold value <span class="math inline"><em>t</em></span> and the total number of photos checked per year <span class="math inline"><em>T</em></span>. Apple throws out the number <span class="math inline"><em>t</em>=10</span> in their technical summary, which seems like a good place to start.</p>
<p>Assuming an average account generates 3-4 pictures a day to be checked then <span class="math inline"><em>T</em>≈1278</span> over a year. Plugging in those numbers, and we get <span class="math inline"><em>P</em>(<code>falsepositive</code>)≈0.00035</span> or 1 in 2858. <span class="sidenote">yikes</span>.</p>
<p>Given that we can go back and calculate the probability of observing, <span class="math inline"><em>P</em>(<code>match</code>)</span>, a match each day…</p>
<p><br /><span class="math display">$$P(\texttt{match}) = 1 - (( 1 - 0.00035)^{3.5}) \approx 0.001225 \approx \frac{1}{816}$$</span><br /></p>
<p>Or, a match once on average every 816 days for a person that only stores 3-4 photos per day.</p>
<p>Not everybody is the average person though; if we applied the same <span class="math inline"><em>P</em>(<code>falsepositive</code>)</span> to a new parent who takes upwards of 50 photos per day, then their <span class="math inline"><em>P</em>(<code>match</code>)</span> is:</p>
<p><br /><span class="math display">$$P(\texttt{match}) = 1 - (( 1 - 0.00035)^{50}) \approx 0.01735 \approx \frac{1}{57}$$</span><br /></p>
<p>Or, a match on average every 57 days.</p>
<p>At this point I feel compelled to point out that these are <strong>average</strong> match probabilities. For the prolific photo taking parent who takes 18250 photos a year, the probability that they actually exceed the threshold in false matches is 6% <span class="sidenote">assuming <span class="math inline"><em>t</em></span> is 10</span>.</p>
<p>It is also worth mentioning that even though we ballparked <span class="math inline"><em>t</em></span> and <span class="math inline"><em>T</em></span> there are explicit constraints on what their values can be. If Apple generates a single <span class="math inline"><em>t</em></span> for all accounts, then <span class="math inline"><em>T</em></span> needs to be an approximation on the average number of photos an account stores per year. If Apple generates a different <span class="math inline"><em>t</em></span> value for every account, then it has enough information already to derive <span class="math inline"><em>P</em>(<code>observation</code>)</span> and break its own obfuscation.</p>
<hr/>
<p>Using what we now know we can revisit the server side operations to show how the observer can calculate the probability of a real match given the probability of any observation and the probability of a synthetic match.</p>
<p>For an “average” account that stores 3-4 photos per day we know that <span class="math inline"><em>P</em>(<code>match</code>)≈0.001225</span>, allowing Apple, who defines <span class="math inline"><em>P</em>(<code>synthetic</code>)</span>, to calculate:</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic}) + 0.001225}{P(\texttt{synthetic})}$$</span><br /></p>
<p>For our prolific “parent” account that stores 50 photos per day we know that <span class="math inline"><em>P</em>(<code>match</code>)≈0.01735</span>, allowing Apple, who defines <span class="math inline"><em>P</em>(<code>synthetic</code>)</span>, to calculate:</p>
<p><br /><span class="math display">$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic}) + 0.01735}{P(\texttt{synthetic})}$$</span><br /></p>
This clarifies our observation from earlier, if Apple define a global <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> then different accounts will naturally have different server-side distributions of observations, and these can be used to tighten the estimates of true matches.
<p class="sidenote">
And, again, if Apple can define <span class="math inline"><em>P</em>(<code>synthetic</code>)</span> on a per-account basis then they have <strong>more</strong> information to use when tightening these estimates
</p>
<hr/>
<p>In this analysis we have deliberately left out other information that Apple, or someone who can compel Apple, may use to tighten these estimates e.g. derived from public social media feeds.</p>
</article>
<hr/>
<h2>
Recent Articles
</h2>
<p>2021-08-10 <a href="obfuscated_apples.html">Obfuscated Apples</a><br></p>
<footer>
Sarah Jamie Lewis
</footer>
</body>
</html>

View File

@ -1,17 +0,0 @@
# Hello World
This is a test. I like writing tests.... <p class="sidenote">This is another test</p> And there
we go
Regardless I will probably end up writing some amount of work here...
This is some more words
$$ x = \pi $$
$$ \alpha \beta \gamma \rho \sigma \delta \epsilon $$
$$ x^n + y^n = z^n $$

170
posts/obfuscated_apples.md Normal file
View File

@ -0,0 +1,170 @@
# Obfuscated Apples
Generating noise in a way which is indistinguishable from real signal is a ridiculously hard problem. Obfuscation does
not hide signal, it only adds noise. <p class="sidenote">if you take anything away from this article please let
it be this fact.</p>
Sadly, most people operate under the assumption that adding noise to a system is all that it takes to make the
signal unrecoverable. This logic is very clearly in operation in Apple's new proposal of on-device scanning
<a class="sidenote" href="https://www.apple.com/child-safety/pdf/CSAM_Detection_Technical_Summary.pdf">technical summary</a> which, among other things, proposes generating *synthetic* matches to hide the true number of *real* matches in the system.
I want to take this opportunity to break down how this kind of obfuscation can be defeated even when not considering
the fact that it is Apple themselves who are charged with generating and maintaining the safety parameters of the system.
<p class="sidenote">i.e. even if we treat the people who design and build this system as honest adversaries.</p>
## Sketching a Basic Scheme
For the sake of clarity I will omit the technical details of the private set intersection protocol, and the threshold
scheme, and we will operate under the assumption that both are cryptographically secure. We will also, for now,
assume that the database of images to compare is incorruptible.
At the heart of the system is a (mostly) black box that contains a perceptual hash function that analyzes an image and
spits out a hash, this hash is then compared against a database of known hashes and if a match is found the system
reports `true` and otherwise reports `false`.
Throughout this article I will use **match** when talking about both true and false positives.
Without question, the server learns a match occurred on the phone during the PSI protocol.
<blockquote class="sidenote">"The output of PSI protocol on the server reveals whether there is a match or not" - Apple Technical Summary</blockquote>
When a certain threshold of real matches is reached, the server gains the ability to decrypt all real matches, a human
reviews them, and a determination is made.
As presented the system above has one major flaw <p class="sidenote">(besides the gross nature of co-opting a personal device
as a surveillance system)</p>: the server learns how many matches the device has reported prior to being able to
decrypt those matches. This is obviously very important metadata in the context of the system and as such needs to be
protected.
To do so Apple rely on the invocation of "Synthetic Vouchers" with the following property:
* "The probability that a device uploads a synthetic voucher instead of a real voucher for an image is calibrated to ensure the total number of synthetics is of the same order of magnitude as the threshold"
## Additional Metadata
So, that is it right? Problem averted? Let's not be too hasty...
From what we know so far there are a few interesting parameters in this system: a threshold $t$, a probability of generating
a synthetic match $P(\texttt{synthetic})$ and the probability of a false positive match $P(\texttt{falsepositive})$.
We also know that Apple has constructed these parameters such that the probability of an account being flagged for human review (i.e. when the number of matches $M > t$) is $P(\texttt{flag}) = 10^{-12}$, or one in one trillion.
<blockquote class="sidenote">"The threshold is selected to provide an extremely low (1 in 1 trillion) probability of incorrectly flagging a given account." - Apple Technical Summary</blockquote>
From these parameters we can start to derive some additional metadata that exists within the system:
### Matches over Threshold without Decryption
One of the most obvious sources of distinguishing information is built explicitly into the design. The threshold
scheme as proposed requires $t$ real matches in order to decrypt the inner envelopes containing the matched images.
Taking the number of observed matches, $o$, the system is able to derive a value $t' = o / t$ or, the number of
observed matches divided by the threshold.
Every time a new match is observed and $t' > 1$ (i.e. the number of observed matches exceeds the threshold) the system learns additional information regarding
the makeup of previous matches.
For example, if we set $t = 10$, then after the system observes $10$ matches (made up of an unknown number of real
or synthetic matches) then every additional match until decryption can be used to derive information about the previous
matches. If the next match to arrive does not allow decryption then the server can derive that there are at least 2 synthetic
matches in the bucket. This confidence increases with every observation that does not result in decryption.
Further we know that synthetic matches happen with probability $P(\texttt{synthetic})$ and as such the **rate** at which
matches are observed, in combination with the above, allows us to define a probability $P(\texttt{match})$ given that
the observation did not result in decryption, and the known probability of a device generating a synthetic value $P(\texttt{synthetic})$.
$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{match}|\texttt{real}) \times P(\texttt{observation})}{P(\texttt{synthetic})}$$
We know that all matches will result in an observation and so...
$$P(\texttt{match}| \texttt{observation}) = \frac{1 \times P(\texttt{observation})}{P(\texttt{synthetic})}$$
Or more simply:
$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{observation})}{P(\texttt{synthetic})}$$
Finally, we can state that the probability of an observation (say over the period of a day) is the probability of a
real match on any given day plus the probability of a synthetic match on any given day:
$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic})+P(\texttt{match})}{P(\texttt{synthetic})}$$
Given that the probability of a synthetic match is defined by Apple, the only unknown in the system is the probability
of a match.
While this may start as an unknown <p class="sidenote">setting aside the fact that Apple already has enough data to derive this themselves</p>, Apple quickly generates a large amount of data relating to when new observations
are made. Since people are different in the ways they take and store photos and live in different parts of the world,
the exact probability of them triggering a check is dependent on them.
It is also obvious at this point that if $P(\texttt{synthetic})$ is generated *naively* <span class="sidenote">(*naively* as in
not account specific)</span> then *any* additional information known about a particular account e.g. the number of photos
they are likely to store over a period of time is enough for **anyone** who knows $P(\texttt{synthetic})$ to derive
$P(\texttt{match})$. <span class="sidenote">We will come back to this point shortly</span>
The secrecy then is highly dependent on both Apple never deriving $P(\texttt{observation})$ for themselves *and* on Apple
generating a distinct $P(\texttt{synthetic})$ for each account. Or rather, the privacy of one of the most sensitive aspects
of this system requires Apple both collecting no information on accounts, and also on Apple knowing enough about accounts
to derive the parameters necessary to keep the information private.
<hr/>
It is actually worse than that though. Remember that Apple has stated that the actual probability of observing $M > t$
false positive matches is 1 in a trillion.
We can work backwards from that number to derive $P(\texttt{falsepositive})$:
$$P(\texttt{flag}) = \sum_{\substack{x = t}}^T {T \choose x} \cdot P(\texttt{falsepositive})^x \cdot (1 - P(\texttt{falsepositive}))^{T - x} \approx 10^{-12}$$
In order to finalize this we only need to make educated guesses about 2 parameters: the threshold value $t$ and the total
number of photos checked per year $T$. Apple throws out the number $t = 10$ in their technical summary, which seems
like a good place to start.
Assuming an average account generates 3-4 pictures a day to be checked then $T \approx 1278$ over a year. Plugging in those
numbers, and we get $P(\texttt{falsepositive}) \approx 0.00035$ or 1 in 2858. <span class="sidenote">yikes</span>.
Given that we can go back and calculate the probability of observing, $P(\texttt{match})$, a match each day...
$$P(\texttt{match}) = 1 - (( 1 - 0.00035)^{3.5}) \approx 0.001225 \approx \frac{1}{816}$$
Or, a match once on average every 816 days for a person that only stores 3-4 photos per day.
Not everybody is the average person though; if we applied the same $P(\texttt{falsepositive})$ to a new parent who takes upwards
of 50 photos per day, then their $P(\texttt{match})$ is:
$$P(\texttt{match}) = 1 - (( 1 - 0.00035)^{50}) \approx 0.01735 \approx \frac{1}{57}$$
Or, a match on average every 57 days.
At this point I feel compelled to point out that these are **average** match probabilities. For the prolific photo
taking parent who takes 18250 photos a year, the probability that they actually exceed the threshold in false matches
is 6% <span class="sidenote">assuming $t$ is 10</span>.
It is also worth mentioning that even though we ballparked $t$ and $T$ there are explicit constraints on what their
values can be. If Apple generates a single $t$ for all accounts, then $T$ needs to be an approximation on the average
number of photos an account stores per year. If Apple generates a different $t$ value for every account, then it has
enough information already to derive $P(\texttt{observation})$ and break its own obfuscation.
<hr/>
Using what we now know we can revisit the server side operations to show how the observer can calculate the probability of a real
match given the probability of any observation and the probability of a synthetic match.
For an "average" account that stores 3-4 photos per day we know that $P(\texttt{match}) \approx 0.001225$, allowing
Apple, who defines $P(\texttt{synthetic})$, to calculate:
$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic}) + 0.001225}{P(\texttt{synthetic})}$$
For our prolific "parent" account that stores 50 photos per day we know that $P(\texttt{match}) \approx 0.01735$, allowing
Apple, who defines $P(\texttt{synthetic})$, to calculate:
$$P(\texttt{match}| \texttt{observation}) = \frac{P(\texttt{synthetic}) + 0.01735}{P(\texttt{synthetic})}$$
This clarifies our observation from earlier, if Apple define a global $P(\texttt{synthetic})$ then different accounts will
naturally have different server-side distributions of observations, and these can be used to tighten the estimates of
true matches. <p class="sidenote">And, again, if Apple can define $P(\texttt{synthetic})$ on a per-account basis then
they have **more** information to use when tightening these estimates</p>
<hr/>
In this analysis we have deliberately left out other information that Apple, or someone who can compel Apple, may use
to tighten these estimates e.g. derived from public social media feeds.

162
ssb
View File

@ -1,22 +1,19 @@
#!/bin/sh
#!/bin/bash
# Print the command-line usage synopsis to stdout, using $0 as the program
# name. Takes no arguments.
function usage
{
echo \
"Usage: $0 [-d|-g|-h|-r] [-e HEADER_PATH] [-f FOOTER_PATH]" \
"Usage: $0 [-d|-h|-r] [-e HEADER_PATH] [-f FOOTER_PATH]" \
"[-m MD_RENDERER] [-o OUTPUT_DIR] [-p POSTS_DIR]"
}
function parse_optargs
{
while getopts "dghre:f:m:o:p:" opt; do
while getopts "dhre:f:m:o:p:" opt; do
case $opt in
d)
DISABLE_POSTS=true;;
g)
gen_template
exit 0;;
h)
usage
exit 0;;
@ -47,153 +44,6 @@ function set_default_args
OUTPUT_DIR=.
}
# Emit the default HTML page header on standard output: doctype, <head>
# with charset/title/viewport/stylesheet link, and the opening <body> with
# a navigation bar. The output ends with one blank line after </header>.
function echo_header_template
{
    # Quoted delimiter: the text is written verbatim, with no expansion.
    cat <<'HEADER_TEMPLATE'
<!DOCTYPE html>
<html lang=en>
<head>
<meta charset="utf-8">
<title>Title</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" type="text/css" href="styles.css">
</head>
<body>
<header>
<nav>
<strong>Title</strong>
<a href="./index.html">home</a>
<a href="mailto:">email</a>
</nav>
</header>

HEADER_TEMPLATE
}
# Emit the default HTML page footer on standard output: a <footer> element
# crediting ssb, followed by the closing </body> and </html> tags.
function echo_footer_template
{
    # Quoted delimiter: the text is written verbatim, with no expansion.
    cat <<'FOOTER_TEMPLATE'
<footer>
Created with <a href="https://github.com/maciejzj/ssb">ssb</a> a simple
static blogger.
</footer>
</body>
</html>
FOOTER_TEMPLATE
}
# Emit the default stylesheet on standard output: a basic page layout,
# empty placeholder rules for common elements, responsive sizing for media
# and tables, and colour rules for syntax-highlighted `code span.*` tokens.
function echo_css_template
{
    # Quoted delimiter: the CSS is written verbatim, with no expansion.
    cat <<'CSS_TEMPLATE'
body { max-width: 40em; margin: auto; padding: 1.5em }
header { font-size: 1.2em; }
footer { text-align: right; }
nav { }
h1 { }
h2 { }
h3 { }
h4 { }
h5 { }
h6 { }
p { }
a { }
li { }
ul { }
ol { }
dl { }
dt { }
dd { }
hr { }
figure { }
figcaption { }
img { }
video { }
table { }
th { }
tr { }
td { }
blockquote { }
pre { }
img, video { width: 100%; }
pre { max-width: 100%; overflow-x: auto; }
table{ max-width: 100%; overflow-x: auto; display: block; }
@media only screen and (max-width: 600px)
{
body { }
}
code span.al { color: #ff0000; font-weight: bold; }
/* Annotation */
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
/* Attribute */
code span.at { color: #7d9029; }
/* BaseN */
code span.bn { color: #40a070; }
/* BuiltIn */
code span.bu { }
/* ControlFlow */
code span.cf { color: #007020; font-weight: bold; }
/* Char */
code span.ch { color: #4070a0; }
/* Constant */
code span.cn { color: #880000; }
/* Comment */
code span.co { color: #60a0b0; font-style: italic; }
/* CommentVar */
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
/* Documentation */
code span.do { color: #ba2121; font-style: italic; }
/* DataType */
code span.dt { color: #902000; }
/* DecVal */
code span.dv { color: #40a070; }
/* Error */
code span.er { color: #ff0000; font-weight: bold; }
/* Extension */
code span.ex { }
/* Float */
code span.fl { color: #40a070; }
/* Function */
code span.fu { color: #06287e; }
/* Import */
code span.im { }
/* Information */
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
/* Keyword */
code span.kw { color: #007020; font-weight: bold; }
/* Operator */
code span.op { color: #666666; }
/* Other */
code span.ot { color: #007020; }
/* Preprocessor */
code span.pp { color: #bc7a00; }
/* SpecialChar */
code span.sc { color: #4070a0; }
/* SpecialString */
code span.ss { color: #bb6688; }
/* String */
code span.st { color: #4070a0; }
/* Variable */
code span.va { color: #19177c; }
/* VerbatimString */
code span.vs { color: #4070a0; }
/* Warning */
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
CSS_TEMPLATE
}
# Write the three built-in templates into the current directory, in order:
# ./header.html, ./footer.html and ./styles.css. Existing files with those
# names are overwritten.
function gen_template
{
    local part target
    for part in header footer css; do
        # Only the stylesheet has a file name that differs from its template name.
        case $part in
            css) target=./styles.css ;;
            *)   target=./$part.html ;;
        esac
        "echo_${part}_template" > "$target"
    done
}
function get_posts
{
@ -218,7 +68,7 @@ function get_mod_date
function append_posts_list
{
posts_list="</article>"
posts_list="</article><hr/><h2>Recent Articles</h2>"
for post in $@; do
file_base=`basename $post .md`
date=`get_mod_date "$post"`
@ -226,7 +76,6 @@ function append_posts_list
post_link="$date [$post_title]($file_base.html)<br>\n"
posts_list="$posts_list$post_link"
done
echo "\n---\n"
echo $posts_list | sort -r
}
@ -236,8 +85,9 @@ function make_html_files
for md_file in $@; do
file_base=`basename $md_file .md`
output_file="$OUTPUT_DIR/$file_base.html"
post_title=`grep -m 1 "^# .*" $md_file | cut -c 3-`
append_posts_list $posts | cat $md_file - | $MD_RENDERER > $output_file
cat $HEADER_PATH $output_file $FOOTER_PATH | tee $output_file
cat $HEADER_PATH $output_file $FOOTER_PATH | sed -e "s/\$TITLE/$post_title/g" | tee $output_file
done
}

View File

@ -7,14 +7,16 @@ body {
font-family: Spectral;
background: #111;
color: #eee;
max-width: 40em;
min-width: 60%;
max-width: 60%;
margin: auto;
padding: 1.5em;
}
article {
margin-left: auto;
margin-left: 0px;
margin-right: auto;
max-width: 60%;
}
.sidenote {
@ -22,14 +24,14 @@ article {
color: #999;
font-style: italic;
font-size: 0.9em;
width: 20%;
margin-right: -21%;
width: 40%;
margin-right: -100%;
position: relative;
transform: translateY(-30%);
transform: translateY(-100%);
}
header { font-size: 1.1em; }
footer { margin-top: 1.5em; text-align: right; }
footer { border-top: 0.25em solid #eee; margin-top: 1.5em; text-align: right; }
nav { margin-bottom: 1.5em; padding-bottom: 0.5em; }
nav a { margin-left: 0.2em; }
h1 { margin: 0.5em 0em; }
@ -44,7 +46,7 @@ p {
hyphens: auto;
text-justify: inter-word;
overflow-wrap: break-word;
margin: 0.5em 0 0.5em 0;
margin: 1em 0 0.5em 0;
}
a { color: #bbb;; text-decoration-thickness: 0.08em; }
li { line-height: 1.5em;}
@ -57,7 +59,7 @@ hr { color: #bfbfbf; border: solid 0.05em; margin: 1.5em 0em; }
figure { color: #646464; margin: 0.5em 0; padding: 0.5em; }
figcaption { padding-top: 0.5em; font-style : italic; }
img { }
video { margin; 1em; }
video { margin: 1em; }
table { border-collapse: collapse; }
tr { background-color: white; }
tr:nth-child(2n) { background-color: #f9f9f9; }
@ -86,12 +88,7 @@ img, video { width: 100%; }
pre { max-width: 100%; overflow-x: auto; }
table{ max-width: 100%; overflow-x: auto; display: block; }
@media only screen and (max-width: 600px)
{
body { font-size: 0.9em; }
nav { display: flex; flex-direction: column;}
nav a { margin-left: 0; margin-top: 0.4em; }
}
code span.al { color: #ff0000; font-weight: bold; }
/* Annotation */