#LyX 2.3 created this file. For more info see http://www.lyx.org/
\lyxformat 544
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass article
\begin_preamble
\usepackage{url}
\usepackage{slashed}
\end_preamble
\use_default_options false
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding utf8
\fontencoding global
\font_roman "times" "default"
\font_sans "helvet" "default"
\font_typewriter "cmtt" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\use_microtype false
\use_dash_ligatures false
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref true
\pdf_bookmarks true
\pdf_bookmarksnumbered false
\pdf_bookmarksopen false
\pdf_bookmarksopenlevel 1
\pdf_breaklinks true
\pdf_pdfborder true
\pdf_colorlinks true
\pdf_backref false
\pdf_pdfusetitle true
\papersize default
\use_geometry false
\use_package amsmath 2
\use_package amssymb 2
\use_package cancel 1
\use_package esint 0
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 0
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 0
\use_minted 0
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\is_math_indent 0
\math_numbering_side default
\quotes_style english
\dynamic_quotes 0
\papercolumns 1
\papersides 1
\paperpagestyle default
\listings_params "basicstyle={\ttfamily},basewidth={0.45em}"
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Title
Language Learning Diary - Part Nine
\end_layout
\begin_layout Date
Oct 2022 – Present
\end_layout
\begin_layout Author
Linas Vepštas
\end_layout
\begin_layout Abstract
The language-learning effort involves research and software development
to implement the ideas concerning unsupervised learning of grammar, syntax
and semantics from corpora.
This document contains supplementary notes and a loosely-organized
semi-chronological diary of results.
The notes here might not always make sense; they are a shorthand for
my own benefit, rather than aimed at you, dear reader!
\end_layout
\begin_layout Section*
Introduction
\end_layout
\begin_layout Standard
Part Nine of the diary explores continuous learning.
\end_layout
\begin_layout Section*
Summary Conclusions
\end_layout
\begin_layout Standard
A summary of what is found in this part of the diary:
\end_layout
\begin_layout Itemize
None yet.
\end_layout
\begin_layout Section*
Hard lessons learned
\end_layout
\begin_layout Standard
Experiment-17 is the teacher.
Here is what we learned:
\end_layout
\begin_layout Itemize
The disjuncts in `r16-merge.rdb` and `r13-all-in-one.rdb` are insufficient
to generate interesting sentences.
There are too few of them.
\end_layout
\begin_layout Itemize
Apparently, trimming has depleted the ranks.
Thus, although they "look good" when examined individually, they're not
rich enough to be used.
\end_layout
\begin_layout Standard
Here is what we can do differently, going forwards:
\end_layout
\begin_layout Itemize
This suggests clustering should be more aggressive.
Clustering enriches the number of available disjuncts on any given word.
\end_layout
\begin_layout Itemize
A solution to not having enough disjuncts of the right shape is to supplement
existing disjuncts with optional single links taken from word-pairs.
This explodes the RAM usage in LG, up to 10 GB or 20 GB or maybe more, depending.
\end_layout
\begin_layout Itemize
The LG `dict-atomese` backend was extended to use word-pairs and also ANY
links.
Disjuncts can now have optional word-pair connectors on them.
(Done, Nov 2022)
\end_layout
\begin_layout Itemize
Since the LG atomese dict can now use single word-pairs, that means it can
do MST/MPG parsing.
Thus, we can get rid of the atomspace MST parser.
(Done, Jan 2023)
\end_layout
\begin_layout Itemize
The LG backend can also supplement disjuncts with ANY links.
(Done, Nov 2022)
\end_layout
\begin_layout Itemize
The MST/MPG mode can also use ANY links.
(Done, Nov 2022)
\end_layout
\begin_layout Itemize
As a result, the LG parser can do all of it -- random-tree ANY parsing,
MST/MPG parsing, and Section/disjunct parsing.
\end_layout
\begin_layout Itemize
This creates a possibility of doing "continuous learning": learning word
pairs and disjuncts at the same time.
\end_layout
\begin_layout Itemize
However, the more complex portions cannot run until the marginals are recomputed.
This suggests a natural awake/asleep cycle.
During the awake cycle, data is ingested.
During the asleep cycle, marginals are (re-)computed, MI is (re-)computed,
and similarities are updated.
This is very nice: it gets rid of the pipeline.
(A minimal sketch of this cycle follows this list.)
\end_layout
\begin_layout Itemize
So it seems like it's time to abolish the pipeline.
\end_layout
\begin_layout Itemize
Easier said than done (Dec 2022). Computing MI on the fly raises issues with
caching, stale data, write-back to the DB, reads from the DB, and general
data flow.
It's a bit messy.
So this has to be a back-burner project.
\end_layout
\begin_layout Itemize
For example, a caching ProxyNode can be created.
This would effectively be the old ECAN idea, this time done right.
\end_layout
\begin_layout Itemize
BTW, we can do GOE similarity with just word-pair MI.
So clustering can begin before disjuncts have been created.
\end_layout
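\begin_layout Standard
To make the awake/asleep idea concrete, here is a minimal control-loop sketch,
written in Python.
Every name in it is hypothetical (the real work would be done by the AtomSpace
pipeline); it only illustrates the intended data flow, not an actual
implementation.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
# Hypothetical sketch of the awake/asleep continuous-learning cycle.
\end_layout
\begin_layout Plain Layout
# The callables passed in stand for the real AtomSpace operations.
\end_layout
\begin_layout Plain Layout
def learn_forever(next_batch, ingest, recompute_marginals,
\end_layout
\begin_layout Plain Layout
                  recompute_mi, update_similarities):
\end_layout
\begin_layout Plain Layout
    while True:
\end_layout
\begin_layout Plain Layout
        # Awake: ingest a batch of text, accumulating raw counts.
\end_layout
\begin_layout Plain Layout
        for sentence in next_batch():
\end_layout
\begin_layout Plain Layout
            ingest(sentence)
\end_layout
\begin_layout Plain Layout
        # Asleep: recompute marginals and MI, refresh similarities,
\end_layout
\begin_layout Plain Layout
        # so the next awake cycle parses with fresh statistics.
\end_layout
\begin_layout Plain Layout
        recompute_marginals()
\end_layout
\begin_layout Plain Layout
        recompute_mi()
\end_layout
\begin_layout Plain Layout
        update_similarities()
\end_layout
\end_inset
\end_layout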
\begin_layout Standard
\end_layout
\begin_layout Section*
The Plan
\end_layout
\begin_layout Itemize
Start with pair counting.
Do NOT trim until after MST.
\end_layout
\begin_layout Itemize
Use uniform sentence lengths.
\end_layout
\begin_layout Itemize
During MST, count the pairs that contributed to the MST.
Let's call this
\begin_inset Quotes eld
\end_inset
second counting
\begin_inset Quotes erd
\end_inset
.
Never trim second-counted pairs (at least, not for the next few steps).
A sketch of second counting follows this list.
\end_layout
\begin_layout Itemize
Perform tentative GOE clustering before MST, perform MST with and without
clusters, try to see which is betters!? How to tell which is better? I
guess higher totla MI.
But how to count/trakc which contributed the most, and still maintain detailed
balance? I.e.
how to
\begin_inset Quotes eld
\end_inset
undo
\begin_inset Quotes erd
\end_inset
clustering? Or will second-counting be sufficient to track this?
\end_layout
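\begin_layout Standard
To make second counting concrete, here is a minimal sketch in Python.
It assumes a plain dictionary pair_mi of word-pair MI scores and builds a
greedy (Prim-style) maximum spanning tree over one sentence, ignoring the
planarity constraints of MPG parsing; the actual pipeline does this with the
LG parser and the AtomSpace, so the names here are purely illustrative.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
def mst_second_count(words, pair_mi, second_count):
\end_layout
\begin_layout Plain Layout
    # Greedy (Prim-style) maximum spanning tree over the words of one
\end_layout
\begin_layout Plain Layout
    # sentence, scored by word-pair MI.  Every pair that lands in the
\end_layout
\begin_layout Plain Layout
    # tree gets its second count bumped; such pairs are never trimmed.
\end_layout
\begin_layout Plain Layout
    connected = [words[0]]
\end_layout
\begin_layout Plain Layout
    remaining = list(words[1:])
\end_layout
\begin_layout Plain Layout
    while remaining:
\end_layout
\begin_layout Plain Layout
        # Pick the highest-MI link joining the tree to a new word.
\end_layout
\begin_layout Plain Layout
        best = max(((a, b) for a in connected for b in remaining),
\end_layout
\begin_layout Plain Layout
                   key=lambda ab: pair_mi.get(ab, float('-inf')))
\end_layout
\begin_layout Plain Layout
        second_count[best] = second_count.get(best, 0) + 1
\end_layout
\begin_layout Plain Layout
        connected.append(best[1])
\end_layout
\begin_layout Plain Layout
        remaining.remove(best[1])
\end_layout
\begin_layout Plain Layout
    return second_count
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The trimming step would then consult the accumulated second_count tallies,
and skip any pair with a nonzero tally.
\end_layout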
\begin_layout Section*
TODO List
\end_layout
\begin_layout Standard
To-do items that are NOT covered in this chapter, but should be done someday,
somehow:
\end_layout
\begin_layout Itemize
How does the number of word-pairs scale as a function of vocabulary size?
\end_layout
\begin_layout Itemize
How does vocabulary size scale as a function of corpus size? We've monitored
the above quantities repeatedly, but never really worked out the scaling
relationships.
\end_layout
\begin_layout Section*
Notes
\end_layout
\begin_layout Standard
Nov 2022 – Tried restarting with `run-1-marg-tranche-123.rdb`, which is not
trimmed.
But it's huge: 300K x 300K, and *lots* of the words have backslashes in them.
Yuck! 52 GB to load...
Need to start over.
\end_layout
\begin_layout Standard
Bringup of the above ideas is in Expt-18.
\end_layout
\begin_layout Section*
Hypervector ruminations
\end_layout
\begin_layout Standard
Some questions:
\end_layout
\begin_layout Itemize
Given a vector in the GOE (say, a particular word, with coordinates measured
via MI, offset and normalized to unit length), which is the nearest cube-corner
(bipolar hypervector)? I assume it's the one with all coordinates
rounded to either +1 or -1.
But this needs proof.
(A sketch addressing the first few of these questions follows this list.)
\end_layout
\begin_layout Itemize
What is the angular distance to that corner?
\end_layout
\begin_layout Itemize
Given a corner in the cube, what is the nearest actual vector?
\end_layout
\begin_layout Itemize
What is the distribution of Hamming distance vs.
actual distance? That is, pick two actual vectors from the dataset.
Their dot product is the
\begin_inset Quotes eld
\end_inset
actual
\begin_inset Quotes erd
\end_inset
distance.
The Hamming distance between them is the Hamming distance between their
nearest cube corners.
\end_layout
\begin_layout Itemize
Given a random cube corner, what is the most efficient way of finding the
nearest element in the dataset?
\end_layout
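\begin_layout Standard
A partial answer to the first few questions, as a sketch: the sign-rounding
guess does appear to be correct, since all corners of the cube have the same
length (the square root of the dimension d), so the nearest corner is the one
maximizing the dot product, and that sum is maximized coordinate-by-coordinate
by taking signs.
The Python/numpy sketch below computes the nearest corner, the angular
distance to it, and the corner-based Hamming distance between two vectors;
it assumes plain numpy arrays, nothing AtomSpace-specific.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
import numpy as np
\end_layout
\begin_layout Plain Layout
def nearest_corner(v):
\end_layout
\begin_layout Plain Layout
    # Nearest bipolar hypervector: round each coordinate to +1 or -1.
\end_layout
\begin_layout Plain Layout
    return np.where(v >= 0.0, 1.0, -1.0)
\end_layout
\begin_layout Plain Layout
def angle_to_corner(v):
\end_layout
\begin_layout Plain Layout
    # Angular distance from a (unit-normalized) vector to its corner.
\end_layout
\begin_layout Plain Layout
    v = v / np.linalg.norm(v)
\end_layout
\begin_layout Plain Layout
    c = nearest_corner(v) / np.sqrt(v.size)
\end_layout
\begin_layout Plain Layout
    return np.arccos(np.clip(np.dot(v, c), -1.0, 1.0))
\end_layout
\begin_layout Plain Layout
def hamming(v, w):
\end_layout
\begin_layout Plain Layout
    # Hamming distance between the nearest corners of two vectors.
\end_layout
\begin_layout Plain Layout
    return int(np.sum(nearest_corner(v) != nearest_corner(w)))
\end_layout
\end_inset
\end_layout
\begin_layout Standard
For a unit vector with all coordinates equal, the angle is zero; for a basis
vector it is arccos of one over the square root of d, which is the worst case.
\end_layout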
\begin_layout Standard
Other unrelated questions:
\end_layout
\begin_layout Itemize
What is the QR decomposition of word-pair MI? viz
\begin_inset Formula $M=QR$
\end_inset
and
\begin_inset Formula $Q$
\end_inset
is orthogonal and
\begin_inset Formula $R$
\end_inset
is upper triangular...
is there significance to this decomposition? Is there a past (left) vs.
future (right) light-cone thing going on?
\end_layout
\begin_layout Itemize
Can the relationship between free groups and hypervectors be exploited?
For example, consider the free group
\begin_inset Formula $F$
\end_inset
in two generators
\begin_inset Formula $A,B$
\end_inset
.
Dividing by the commutator
\begin_inset Formula $AB-BA=0$
\end_inset
just abelianizes this, reducing it to 2D Cartesian space (well, I'm skipping
details; it goes to
\begin_inset Formula $\mathbb{Z}\times\mathbb{Z}$
\end_inset
, and the module over the reals with more constraints gives Cartesian space).
We can do the same tricks in very high dimensions, except this time leaving
some of the generators free, or perhaps applying other constraints, to
create presentations of more complex groups, and then working with modules
over them.
This then has the flavor of a history monoid or trace monoid, but now as
groups/modules.
The
\begin_inset Quotes eld
\end_inset
central
\begin_inset Quotes erd
\end_inset
part is fully abelianized and high-dimensional, but additional
\begin_inset Quotes eld
\end_inset
dimensions
\begin_inset Quotes erd
\end_inset
might not be, retaining a free structure, or being partially constrained.
The question: can this lead to any useful insights or tools?
\end_layout
\begin_layout Itemize
BTW, the above provides an explanation for the perception of fractals and
hyperbolic spaces in hallucinations (DMT, LSD, etc.): the hypothesis is
this.
Neural (cortical) columns in the visual cortex provide basic structures
for parallel processing, but, without cross connections, the processing
is that of a free monoid.
To perceive 3D spaces, there are cross-column connections that abelianize
the free module down to Cartesian 3D space, which we perceive
\begin_inset Quotes eld
\end_inset
directly
\begin_inset Quotes erd
\end_inset
.
The hallucinogens disrupt the communications between columns, exposing
the basic parallel, free structure: i.e.
the fractals and hyperbolic spaces.
\end_layout
\begin_layout Itemize
The above also suggests that DMT might allow the brain to achieve 4D or 5D
perception more quickly.
That is, we can project 4D, 5D shapes down to a 2D computer screen/visual
cortex, and the task is to learn, in a
\begin_inset Quotes eld
\end_inset
natural
\begin_inset Quotes erd
\end_inset
way, to perceive the 4D space.
This would seem to require rewiring the cortical columns from their 3D module
presentation to a 4D presentation, and perhaps the hallucinogens would
ease the required disruption and rewiring.
See web page,
\begin_inset CommandInset href
LatexCommand href
name "Hallucination of Fractals"
target "https://linas.org/math/hallucination.html"
literal "false"
\end_inset
.
\end_layout
\begin_layout Standard
Then the oldie but goodie:
\end_layout
\begin_layout Itemize
The GOE vectors form a de facto vierbein.
What happens when I move from word to word? Can I define a connection?
a curvature? a torsion? Even if it's flat, when I travel around a loop,
is there a holonomy?
\end_layout
\begin_layout Section*
Counting vs.
Bayesian Probability
\end_layout
\begin_layout Standard
Open question: How can the counting methods that we've been employing so
far be bridged back to Bayesian theory? For the simplest possible case,
there is a clear bridge between counting and Bayesian probability.
It is summarized by the vapid catch-phrase
\begin_inset Quotes eld
\end_inset
update your priors
\begin_inset Quotes erd
\end_inset
.
But how can this be done for structure learning?
\end_layout
\begin_layout Subsection*
The Bernoulli Process
\end_layout
\begin_layout Standard
Below follows a laborious analysis of the simplest possible case: the Bernoulli
process of a coin toss.
In this case, there is a very direct relation between counting and Bayesian
theory: The Bayesian prior is given by the beta function, which increments
by one with each coin-toss result.
One performs counting on coin-toss results, and the beta function provides
the correct way to
\begin_inset Quotes eld
\end_inset
update your priors
\begin_inset Quotes erd
\end_inset
with each new toss.
Let's proceed with this laborious review.
It tries to hit on all the key assumptions going into the analysis.
\end_layout
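\begin_layout Standard
As a numerical sanity check on the counting-equals-prior-update claim, the
sketch below (Python, no libraries) just keeps the two counts; the posterior
after the observed tosses is the Beta distribution with exponents incremented
by the counts, and the predictive probability of heads is its mean.
This anticipates, in closed form, what the derivation below arrives at in
terms of Beta functions.
\end_layout
\begin_layout Standard
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
def predictive_heads(tosses):
\end_layout
\begin_layout Plain Layout
    # Start from the uniform prior Beta(1,1); after the observed
\end_layout
\begin_layout Plain Layout
    # tosses the posterior is Beta(n_heads+1, n_tails+1).  Updating
\end_layout
\begin_layout Plain Layout
    # the prior is literally just counting.
\end_layout
\begin_layout Plain Layout
    n_heads = sum(1 for t in tosses if t == 'H')
\end_layout
\begin_layout Plain Layout
    n_tails = len(tosses) - n_heads
\end_layout
\begin_layout Plain Layout
    # Predictive probability that the next toss is heads: the
\end_layout
\begin_layout Plain Layout
    # posterior mean, i.e. Laplace's rule of succession.
\end_layout
\begin_layout Plain Layout
    return (n_heads + 1) / (n_heads + n_tails + 2)
\end_layout
\begin_layout Plain Layout
print(predictive_heads([]))                    # 0.5, the uniform prior
\end_layout
\begin_layout Plain Layout
print(predictive_heads(['H', 'H', 'H', 'T']))  # 4/6 = 0.666...
\end_layout
\end_inset
\end_layout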
\begin_layout Standard
If we know (a priori) that we are dealing with a coin toss (only two outcomes:
heads or tails) and if we know (a priori) that the same coin is being used
(it's a stationary process) and finally, if we know (a priori) that there
is no interference with the coin, then, by careful analysis, we can conclude
that the Bernoulli process can be described by a single real number
\begin_inset Formula $0\le p\le1$
\end_inset
giving the probability that the next coin toss is
\begin_inset Formula
\begin{align*}
P\left(\mbox{heads}\right) & =p\\
P\left(\mbox{tails}\right) & =1-p
\end{align*}
\end_inset
But we do not have any a priori information about what
\begin_inset Formula $p$
\end_inset
might be.
The task is to try to discover this number, by observation of an actual
sequence of coin tosses.
How does one do this? Well, the theory of Bayesian probability provides
a strict methodology.
Let's review this.
\end_layout
\begin_layout Standard
Since one does not know what
\begin_inset Formula $p$
\end_inset
might be, one creates a
\begin_inset Quotes eld
\end_inset
prior
\begin_inset Quotes erd
\end_inset
, a reasonable assumption about it.
Here, the reasonable assumption would be that the likelihood of
\begin_inset Formula $p=\theta$
\end_inset
is uniformly distributed over the interval
\begin_inset Formula $0\le\theta\le1$
\end_inset
.
That is, a hypothesis
\begin_inset Formula $h_{\theta}$
\end_inset
is made that
\begin_inset Formula $p=\theta$
\end_inset
and the letter
\begin_inset Formula $\theta$
\end_inset
is acting as a label indicating that we are currently working with hypothesis
\begin_inset Formula $h_{\theta}$
\end_inset
.
In the present case, we have uncountably many hypotheses.
We express the idea of zero knowledge by assigning a uniform distribution
\begin_inset Formula $\mu\left(h_{\theta}\right)=1$
\end_inset
for the hypothesis
\begin_inset Formula $h_{\theta}$
\end_inset
.
I write
\begin_inset Formula $\mu\left(h_{\theta}\right)$
\end_inset
instead of
\begin_inset Formula $P\left(h_{\theta}\right)$
\end_inset
for the prior, to emphasize that
\begin_inset Formula $\mu$
\end_inset
can be, and should be thought of as a measure-theoretic measure on the
unit interval.
\end_layout
\begin_layout Standard
Lets assume that
\begin_inset Formula $N\ge0$
\end_inset
coin tosses have been observed, giving a sequence
\begin_inset Formula $x=\left[x_{1},x_{2},\cdots,x_{N}\right]$
\end_inset
with each
\begin_inset Formula $x_{k}\in\left\{ \mbox{heads},\mbox{tails}\right\} $
\end_inset
.
The probability of observing the next toss
\begin_inset Formula $y=x_{N+1}\in\left\{ \mbox{heads},\mbox{tails}\right\} $
\end_inset
is factored through the likelihoods of the hypotheses:
\begin_inset Formula
\begin{align*}
P\left(y\vert x\right)= & \int_{\theta}P\left(y\left|h_{\theta}\right.\right)P\left(\left.h_{\theta}\right|x\right)\\
= & \int_{\theta}P\left(y\left|h_{\theta}\right.\right)\mu\left(\left.h_{\theta}\right|x\right)
\end{align*}
\end_inset
As before, I use the non-standard notation
\begin_inset Formula $\mu\left(\left.h_{\theta}\right|x\right)=P\left(\left.h_{\theta}\right|x\right)$
\end_inset
to indicate that
\begin_inset Formula $\mu$
\end_inset
is a measure on the space of hypotheses.
This is useful for several reasons: first, it avoids a confusing proliferation
of
\begin_inset Formula $P$
\end_inset
's, twisty little passages all alike.
Best if different actors wore different clothes, so that we can tell them
apart more easily.
Secondly, it helps distinguish likelihood from probability, and makes clear
that distributions on hypotheses are very different from probabilities
of future, unknown events.
The space of hypotheses belongs to the
\begin_inset Quotes eld
\end_inset
world model
\begin_inset Quotes erd
\end_inset
: the model that we keep in our heads, to represent the universe outside
of ourselves.
The integral over hypotheses makes it clear that the natural interpretation
of Bayesian theory is the Many-Worlds Interpretation: we assign each possible
world
\begin_inset Formula $h_{\theta}$
\end_inset
a measure
\begin_inset Formula $\mu\left(\left.h_{\theta}\right|x\right)$
\end_inset
.
In statistics, this is called the
\begin_inset Quotes eld
\end_inset
belief
\begin_inset Quotes erd
\end_inset
that the possible world
\begin_inset Formula $h_{\theta}$
\end_inset
is the true world in which we live.
This talk of
\begin_inset Quotes eld
\end_inset
beliefs
\begin_inset Quotes erd
\end_inset
indicates that we must apply modal logic, and perhaps Kripke semantics
or at least some kind of possible-world semantics, when reasoning about
Bayesian priors.
Last but not least, the factorization across possible worlds can now be
written as
\begin_inset Formula
\begin{align*}
P\left(y\vert x\right)= & \int_{\theta}P\left(y\left|h_{\theta}\right.\right)\mu\left(\left.h_{\theta}\right|x\right)\\
= & \int_{\mu}P\left(y\left|h_{\theta}\right.\right)
\end{align*}
\end_inset
where
\begin_inset Formula $\int_{\mu}$
\end_inset
is the conventional notation for an integral performed over a measurable
space; viz.
a Borel set endowed with a topology; presumably the weak topology that
allows additive measures and all the other goodness implied by the Kolmogorov
equivalence between probability and measure theory.
To retain measurability, we require that the set of all possible worlds
have a probability of one:
\begin_inset Formula
\[
1=\int_{\mu}=\int_{\theta}\mu\left(\left.h_{\theta}\right|x\right)
\]
\end_inset
In the language of statistical mechanics, the set
\begin_inset Formula $\left\{ \mu\left(\left.h_{\theta}\right|x\right)\mbox{ s.t. }0\le\theta\le1\right\} $
\end_inset
is called a
\begin_inset Quotes eld
\end_inset
canonical ensemble
\begin_inset Quotes erd
\end_inset
.
\end_layout
\begin_layout Standard
The hypothesis space includes a number of priors that are not updated.
These are invariants; they don't change because they are, in a sense, outside
of the scope of the problem being observed.
The framework being developed here does not provide a mechanism for updating
these beliefs:
\end_layout
\begin_layout Itemize
Assumptions about the physics of coin tosses.
\end_layout
\begin_layout Itemize
Assumption that only two outcomes are possible.
\end_layout
\begin_layout Itemize
Assumption that the same coin is tossed, each time; that the coin does not
change with time.
\end_layout
\begin_layout Itemize
Assumption that the outcome of a coin toss is correctly observed, and that
there is no observational noise (stochastic or systematic).
\end_layout
\begin_layout Itemize
Assumption that the tools of Bayesian theory are accessible for use, and
that the theory itself is correct.
\end_layout
\begin_layout Itemize
Assumption that logical inference can be performed and that the outcome
of using logic is trustworthy.
\end_layout
\begin_layout Itemize
Assumption that a toolset for algebraic manipulations is accessible and
employable.
\end_layout
\begin_layout Standard
The first four bullets above seem reasonable, as they can be obviously varied:
a wind may be blowing; the coin may be pyramidal; the coin might bend or
change shape with each toss.
Poor eyesight might result in some of the tails being perceived as heads
(a systematic bias).
The last bullets are weirder: we normally accept logic and algebra as foundatio
nal, fixed and always true.
In practice, however, it is not so clear-cut: algebraic calculations may
be intractable; logical reasoning chains too deep.
Mistakes (bugs) in the software and problem setup may occur.
This text that you are reading right now, that explains things, may itself
explain things incorrectly.
Perhaps superior techniques remain undiscovered.
All these issues are set aside for this example, although they lurk for
the general case.
\end_layout
\begin_layout Standard
Using the a priori assumptions about the physics of coin tosses and the
mathematical nature of stochastic processes, then applying logical inference
to the coin-toss problem, we may conclude, in an a priori fashion, that,
given hypothesis
\begin_inset Formula $h_{\theta}$
\end_inset
, the probability of the coin-toss result is
\begin_inset Formula
\[
P\left(y\left|h_{\theta}\right.\right)=\begin{cases}
\theta & \mbox{if }y=\mbox{heads}\\
1-\theta & \mbox{if }y=\mbox{tails}
\end{cases}
\]
\end_inset
To obtain an expression for
\begin_inset Formula $\mu\left(\left.h_{\theta}\right|x\right)$
\end_inset
, we apply Bayes rule, which relates the likelihood of a hypothesis to the
probability of prior outcomes:
\begin_inset Formula
\[
\mu\left(\left.h_{\theta}\right|x\right)=P\left(\left.h_{\theta}\right|x\right)=\alpha P\left(x\left|h_{\theta}\right.\right)\mu\left(h_{\theta}\right)
\]
\end_inset
We already have that the prior, before any observations at all have been
made, is
\begin_inset Formula
\[
\mu\left(h_{\theta}\right)=\mu\left(\left.h_{\theta}\right|\varnothing\right)=1
\]
\end_inset
The coefficient
\begin_inset Formula $\alpha$
\end_inset
is a normalization constant, forced by
\begin_inset Formula $1=\int_{\theta}\mu\left(\left.h_{\theta}\right|x\right)$
\end_inset
as noted above.
To obtain the
\begin_inset Quotes eld
\end_inset
a posteriori prior
\begin_inset Quotes erd
\end_inset
\begin_inset Formula $\mu\left(\left.h_{\theta}\right|x\right)$
\end_inset
that holds after measurements
\begin_inset Formula $x$
\end_inset
, we need an expression for
\begin_inset Formula $P\left(x\left|h_{\theta}\right.\right)$
\end_inset
.
\end_layout
\begin_layout Standard
The assumption of independence implies that the Cartesian product can be
used on the event space.
That is, we are working with a Bernoulli process, not a Markov process,
nor something more complex.
Thus, a priori logical inference allows us to conclude that we can work
with the cylinder set measure on the weak topology on a Cartesian product
space.
Cylinder set measures factorize in a
\begin_inset Quotes eld
\end_inset
trivial
\begin_inset Quotes erd
\end_inset
way, which allows the expression
\begin_inset Formula
\[
P\left(x\left|h_{\theta}\right.\right)=\prod_{k=1}^{N}P\left(x_{k}\left|h_{\theta}\right.\right)
\]
\end_inset
This reduces the problem to simply counting the number of times
\begin_inset Formula $N_{H}$
\end_inset
that heads has been observed, and
\begin_inset Formula $N_{T}=N-N_{H}$
\end_inset
that tails has been observed.
Plugging in,
\begin_inset Formula
\[
P\left(x\left|h_{\theta}\right.\right)=\theta^{N_{H}}\left(1-\theta\right)^{N_{T}}
\]
\end_inset
This provides a closed-form expression for the probability of observing
a specific result of a coin toss, given a prior observational history:
\begin_inset Formula
\begin{align*}
P\left(y\vert x\right)= & \int_{\mu}P\left(y\left|h_{\theta}\right.\right)\\
= & \int_{\theta}P\left(y\left|h_{\theta}\right.\right)\mu\left(\left.h_{\theta}\right|x\right)\\
= & \alpha\int_{\theta}P\left(y\left|h_{\theta}\right.\right)P\left(x\left|h_{\theta}\right.\right)\\
= & \begin{cases}
\alpha\beta\left(N_{H}+1,N_{T}\right) & \mbox{if }y=\mbox{heads}\\
\alpha\beta\left(N_{H},N_{T}+1\right) & \mbox{if }y=\mbox{tails}
\end{cases}
\end{align*}
\end_inset
where
\begin_inset Formula
\[
\beta\left(a,b\right)=\int d\theta\;\theta^{a}\left(1-\theta\right)^{b}=B\left(a+1,b+1\right)
\]
\end_inset
and
\begin_inset Formula
\[
B\left(z_{1},z_{2}\right)=\frac{\Gamma\left(z_{1}\right)\Gamma\left(z_{2}\right)}{\Gamma\left(z_{1}+z_{2}\right)}
\]
\end_inset
is the conventional Beta function (essentially the reciprocal of the binomial coefficient).
\end_layout
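\begin_layout Standard
One further step, not spelled out above but following directly from it: the
normalization forces
\begin_inset Formula $\alpha=1/\beta\left(N_{H},N_{T}\right)$
\end_inset
, and so the predictive probability collapses to Laplace's rule of succession,
\begin_inset Formula 
\[
P\left(\mbox{heads}\vert x\right)=\frac{\beta\left(N_{H}+1,N_{T}\right)}{\beta\left(N_{H},N_{T}\right)}=\frac{B\left(N_{H}+2,N_{T}+1\right)}{B\left(N_{H}+1,N_{T}+1\right)}=\frac{N_{H}+1}{N+2}
\]
\end_inset
so that, for example, after observing three heads and one tail, the probability
that the next toss is heads is 4/6.
\end_layout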
\begin_layout Standard
At last, we have arrived.
We've reduced the problem of observing coin tosses into two parts:
\end_layout
\begin_layout Itemize
A large, complicated set of a priori assumptions and logical, mathematical,
algebraic manipulations, all of which can be done before observing any
coin tosses at all.
\end_layout