%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
\usetheme{Warsaw}
%\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{graphics}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{amsmath}
\usepackage{epic}
\usepackage{epsfig}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
\usepackage{listings}
%\algsetup{indent=2em}
\vspace{-0.5cm}
\title{Directed Graphical Models}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe José Bravo Márquez}}
\date{ \today }
\begin{document}
\begin{frame}
\titlepage
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}{Directed Graphical Models}
\scriptsize{
\begin{itemize}
\item Probabilistic graphical models (PGMs) provide a visual representation of the underlying structure of a joint probability distribution \cite{ruozzi}.
\item In this class we will focus on directed graphical models (DGMs), which are one type of PGM.
\item Directed graphical models (DGMs) are a family of probability distributions that admit a compact parametrization that can be naturally described using a \textbf{directed acyclic graph}.
\item DGMs were introduced by Turing Award winner Judea Pearl under the name \textbf{Bayesian networks}.
\item We won't use that term much in this class because statistical inference for DGMs can be performed using either frequentist or Bayesian methods, which makes the name misleading \cite{wasserman2013all}.
\end{itemize}
}
\end{frame}
\begin{frame}{Judea Pearl}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.8]{pics/judea.jpg}
\caption{Judea Pearl, source: \url{https://www.amacad.org/person/judea-pearl}}
\end{figure}
\end{frame}
\begin{frame}{Directed Graphical Models}
\scriptsize{
\begin{itemize}
\item DGMs link two very different branches of mathematics: probability and graph theory.
\item They also have intriguing connections to philosophy, particularly the question of \textbf{causality}.
\item At the same time, they are widely used in statistics and machine learning.
\item DGMs can be used to solve problems in fields as diverse as medicine, language processing, vision, and many others \cite{ermon_kuleshov}.
\item But before introducing them more formally we need to learn the following mathematical concepts:
\begin{enumerate}
\scriptsize{
\item Conditional independence
\item The chain rule of probability
\item Directed acyclic graphs (DAGs) }
\end{enumerate}
\end{itemize}
}
\end{frame}
\begin{frame}{Conditional Independence}
\scriptsize{
\begin{itemize}
\item Two random variables $X$ and $Y$ are independent, written $X \perp Y$ if $f(x,y)=f(x)f(y)$ for all values of $x$, $y$.
\item This also implies that $f(x|y)=f(x)$ and $f(y|x)=f(y)$.
\item Notice that $f$ can be either a density function for continuous random variables or a probability mass function for discrete random variables.
\item In the same way $f(x|y)$ and $f(y|x)$ correspond to conditional densities or mass functions.
\item Now, suppose we have three random variables $A,B,C$.
\item $A$ and $B$ are conditionally independent given $C$, written $A \perp B | C$, if:
\begin{displaymath}
f(a|b,c) = f(a|c)
\end{displaymath}
for all $a$, $b$, and $c$.
\end{itemize}
}
\end{frame}
\begin{frame}{Conditional Independence}
\scriptsize{
\begin{itemize}
\item Intuitively, $A \perp B | C$ means that, once you know $C$, $B$ provides no extra information about $A$.
\item Notice that $A \perp B | C$ doesn't necessarily imply that $A \perp B$.
\end{itemize}
\begin{block}{Example}
\begin{itemize}
\item Let $H,V,A$ be three random variables representing a person's height, vocabulary and age.
\item $H$ and $V$ are dependent, $f(v|h) \neq f(v)$,
since very short people tend to be children, who have more basic vocabularies.
\item But if we know that two people are 19 years old (i.e., conditional on age), being told that one of them is taller gives us no reason to think that his or her vocabulary is larger (a small simulation illustrating this is sketched on the next slide):
\begin{displaymath}
f(v|h,a)=f(v|a) \Leftrightarrow V \perp H | A
\end{displaymath}
\end{itemize}
\end{block}
}
\end{frame}
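\begin{frame}[fragile]{Conditional Independence}
\scriptsize{
\begin{itemize}
\item A minimal R sketch of the height/vocabulary/age example (all numbers below are invented purely for illustration, not taken from real data):
\end{itemize}
\begin{verbatim}
set.seed(1)
n   <- 100000
age <- sample(5:40, n, replace = TRUE)              # A
height <- 100 + 2.0*pmin(age, 20) + rnorm(n, sd = 5)   # H depends only on A
vocab  <- 500*pmin(age, 20) + rnorm(n, sd = 300)       # V depends only on A
cor(height, vocab)                        # clearly positive: H, V dependent
cor(height[age == 19], vocab[age == 19])  # near zero: V indep. of H given A
\end{verbatim}
\begin{itemize}
\item The marginal correlation between height and vocabulary is large, but within a fixed age group it is approximately zero, which is what $V \perp H | A$ asserts.
\end{itemize}
}
\end{frame}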
\begin{frame}{The chain rule of probability}
\scriptsize{
\begin{itemize}
\item For a set of random variables $X_1,\dots, X_n$, the chain rule of probability allows us to express the joint probability function $f(x_1,x_2,\dots, x_n)$ as a product of $n$ conditional probabilities:
\begin{displaymath}
f(x_1,x_2,\dots,x_n)=f(x_1)f(x_2|x_1)f(x_3|x_2,x_1)\dots f(x_n|x_{n-1},\dots,x_2,x_1).
\end{displaymath}
\item For example for the case of $n=3$
\begin{displaymath}
f(x_1,x_2,x_3)=f(x_1)f(x_2|x_1)f(x_3|x_2,x_1)
\end{displaymath}
using the definition of conditional probabilities we have that
\begin{displaymath}
f(x_2|x_1) = f(x_1,x_2)/f(x_1)
\end{displaymath}
and \begin{displaymath}
f(x_3|x_2,x_1)=f(x_1,x_2,x_3)/f(x_1,x_2)
\end{displaymath}
\item Substituting these expressions into the product, the intermediate terms cancel and we recover $f(x_1,x_2,x_3)$ (a numerical check is sketched on the next slide).
\end{itemize}
}
\end{frame}
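\begin{frame}[fragile]{The chain rule of probability}
\scriptsize{
\begin{itemize}
\item A minimal R sketch (using an arbitrary, randomly generated joint table, purely for illustration) checking that the chain-rule factors multiply back to the joint for three binary variables:
\end{itemize}
\begin{verbatim}
set.seed(1)
f <- array(runif(8), dim = c(2, 2, 2))   # arbitrary f(x1, x2, x3)
f <- f / sum(f)                          # normalize so it sums to 1

f1  <- apply(f, 1, sum)                  # f(x1)
f12 <- apply(f, c(1, 2), sum)            # f(x1, x2)
# chain rule: f(x1) * f(x2|x1) * f(x3|x1,x2)
chain <- function(x1, x2, x3)
  f1[x1] * (f12[x1,x2]/f1[x1]) * (f[x1,x2,x3]/f12[x1,x2])
all.equal(chain(2, 1, 2), f[2, 1, 2])    # TRUE
\end{verbatim}
}
\end{frame}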
\begin{frame}{Joint Probabilities}
\scriptsize{
\begin{itemize}
\item Expressing a joint probability function can be expensive.
\item For example, if there are $m$ binary random variables, the complete distribution is specified by $2^{m}-1$ joint probabilities.
\item For example, if we have two Boolean variables $(A,B)$, we need the probabilities $\mathbb{P}(A,B)$, $\mathbb{P}(\neg A,B)$, $\mathbb{P}(A,\neg B)$, and $\mathbb{P}(\neg A, \neg B)$.
\item A joint distribution for a set of random variables gives all the information there is about the distribution.
\item Note that for $m$ Boolean variables, the joint distribution contains $2^m$ values.
\item However, all the joint probabilities must sum to 1, because the probability of the set of all possible outcomes must be 1.
\item Thus, to specify the joint distribution, one only needs to specify $2^{m}-1$ numbers.
\end{itemize}
}
\end{frame}
\begin{frame}{Joint Probabilities}
\scriptsize{
\begin{itemize}
\item The main idea of DGMs is that by assuming that some variables are conditionally independent we can use the probability chain rule to represent the joint distribution more compactly.
\item For instance, in the previous example a possible conditional independence assumption is $X_3 \perp X_1 | X_2$, i.e., $f(x_3|x_2,x_1)= f(x_3|x_2)$.
\item Then our joint distribution would be reduced to:
\begin{displaymath}
f(x_1,x_2,x_3)=f(x_1)f(x_2|x_1)f(x_3|x_2)
\end{displaymath}
\item In the following slides we will clarify these concepts with the Student example given in \cite{koller2009probabilistic}.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Consider the problem faced by a company trying to hire a recent college graduate.
\item The company's goal is to hire intelligent employees, but there is no way to test intelligence directly.
\item However, the company has access to the student's SAT scores\footnote{The SAT is a standardized test widely used for college admissions in the United States.}, which are informative but not fully indicative.
\item Thus, our probability space is induced by the two random variables Intelligence (I) and SAT (S).
\item For simplicity, we assume that each of these takes two values: $Val(I) = \{i^1 , i^0 \}$, which represent
the values high intelligence ($i^1$) and low intelligence ($i^0$).
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Similarly, $Val(S) = \{s^1 , s^0 \}$, which
represent a high score ($s^1$) and a low score ($s^0$), respectively.
\item Thus, our joint probability function in this case has four entries.
\item For example, one possible joint probability function $f$ would be:
\begin{table}
\centering
\begin{tabular}{cc|c} \hline
$I$ & $S$ & $f(i,s)$ \\ \hline
$i^0$ & $s^0$ & 0.665 \\
$i^0$ & $s^1$ & 0.035 \\
$i^1$ & $s^0$ & 0.06 \\
$i^1$ & $s^1$ & 0.24
\end{tabular}
\end{table}
\item Notice that we would need 3 parameters to specify this joint probability function ($2^{m}-1$ with $m=2$).
\item Alternatively, we could use the chain rule of conditional probabilities to represent the joint probability function as follows:
\begin{displaymath}
f(i,s) = f(i)f(s|i)
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Other factorizations obtained by changing the order of the variables are also valid (e.g., $f(i,s) = f(s)f(i|s)$).
\item However, it is convenient to represent the process in a way that is more compatible with causality.
\item Various factors (e.g., genetics, upbringing) first determine (stochastically) the student's intelligence.
\item The student's performance on the SAT is then determined (stochastically) by his or her intelligence.
\item We note that the models we construct are not required to follow causal intuitions, but they often do.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item From a mathematical perspective, $ f(i,s) = f(i)f(s|i)$ leads to an alternative way of representing the joint probability function.
\item Instead of specifying the various joint entries $f(i,s)$, we would specify it in the form of $f(i)$ and $f(s|i)$.
\item Thus, we could represent the joint probability function using two tables.
\item The first one representing the marginal probability function of $I$.
\begin{table}
\centering
\begin{tabular}{cc} \hline
$i^0$ & $i^1$ \\ \hline
0.7 & 0.3
\end{tabular}
\end{table}
\item These numbers can be easily verified from the previous table. For example
\begin{displaymath}
f(i^{0})=f(i^{0},s^0)+f(i^{0},s^1)=0.665+0.035=0.7
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item The second table represents the conditional probability function of $S$ given $I$:
\begin{table}
\centering
\begin{tabular}{c||cc} \hline
$I$ & $s^0$ & $s^1$ \\ \hline
$i^0$ & 0.95 & 0.05 \\
$i^1$ & 0.2 & 0.8 \\
\end{tabular}
\end{table}
\item which can also be verified from the previous table using the definition of conditional probability (see the R sketch on the next slide). For example:
\begin{displaymath}
f(s^{0}|i^0)=f(s^{0},i^0)/f(i^0)= 0.665/0.7=0.95
\end{displaymath}
\item The conditional probability function $f(s| i)$ represents the probability that the student will succeed on his SATs in the two possible cases:
\begin{enumerate}
\scriptsize{
\item The case where the student's intelligence is low.
\item The case where it is high. }
\end{enumerate}
\end{itemize}
}
\end{frame}
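\begin{frame}[fragile]{The Student Example}
\scriptsize{
\begin{itemize}
\item A minimal R sketch recovering the marginal and conditional tables from the joint table given before (the numbers are the ones from the slides):
\end{itemize}
\begin{verbatim}
f_is <- matrix(c(0.665, 0.035,
                 0.060, 0.240), nrow = 2, byrow = TRUE,
               dimnames = list(c("i0","i1"), c("s0","s1")))

f_i   <- rowSums(f_is)     # marginal f(i):  i0 = 0.7, i1 = 0.3
f_s_i <- f_is / f_i        # conditional f(s|i), each row sums to 1
f_s_i["i0", "s0"]          # 0.95, as in the table above
\end{verbatim}
}
\end{frame}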
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item The conditional probability function asserts that a student of low intelligence is extremely unlikely to get a high SAT score ($f(s^1 | i^0) = 0.05$).
\item On the other hand, a student of high intelligence is likely, but far from
certain, to get a high SAT score ($f(s^1 | i^1) = 0.8$).
\item It is instructive to consider how we could parameterize this alternative representation.
\item Here, we are using three Bernoulli distributions, one for $f(i)$, and two for $f(s | i^0)$ and $f(s | i^1)$.
\item Hence, we can parameterize this representation using three independent parameters, say $\theta_{i^1}$, $\theta_{s^1|i^1}$, and $\theta_{s^1|i^0}$.
\item Recall that the original representation also required 3 parameters.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Thus, although the conditional representation is more natural
than the explicit representation of the joint, it is not more compact.
\item However, as we will soon see, the conditional parameterization provides a basis for our compact representations of more complex distributions.
\item Although we haven't defined directed acyclic graphs (DAGs) yet, it is instructive to see how this example would be represented as one.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.6]{pics/sat1.png}
\end{figure}
\item The DAG from above has a node for each of the two random variables $I$ and $S$, with an edge from $I$ to $S$ representing the direction of the dependence in this model.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Let's assume now that the company also has access to the student's grade $G$ in some course.
\item In this case, our probability space is the joint probability function over the
three relevant random variables $I$, $S$, and $G$.
\item We will assume that $I$ and $S$ are as before, and that $G$ takes on three values $g^1, g^2, g^3$, representing the grades A, B, and C, respectively.
\item So, the joint probability function has twelve entries.
\item We can see that for any reasonable joint probability function $f$, there are no independencies that hold.
\item The student's intelligence is clearly correlated both with SAT score and grade.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item The SAT score and grade are also not independent.
\item A high SAT score increases the chances of getting a high grade.
\item Thus, we may assume that, for our particular probability function $f$,
$f(g^1|s^1) > f(g^1|s^0)$.
\item However, it is quite plausible that our probability function $f$ in this case satisfies a conditional independence property.
\item If we know that the student has high intelligence, a high SAT score no longer gives us extra information about the student's performance in the class.
\item More formally, $f(g|s,i)=f(g|i)$, i.e., $G \perp S | I$.
\item So the joint probability function can be reduced from
\begin{displaymath}
f(i,s,g)=f(i)f(s|i)f(g|i,s)
\end{displaymath}
to
\begin{displaymath}
f(i,s,g)=f(i)f(s|i)f(g|i)
\end{displaymath}
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Now our joint probability function is specified by $f(i)$, $f(s|i)$, and $f(g|i)$.
\item Here $f(i)$ and $f(s|i)$ might be the same as before, while $f(g|i)$ might be:
\begin{table}
\centering
\begin{tabular}{c||ccc} \hline
$I$ & $g^1$ & $g^2$ & $g^3$ \\ \hline
$i^0$ & 0.2 & 0.34 & 0.46 \\
$i^1$ & 0.74 & 0.17 & 0.09 \\
\end{tabular}
\end{table}
\item Now we can calculate the joint probability of any combination of values using the chain rule of probability (an R sketch is given on the next slide).
\item For example:
\begin{align*}
f(i^1,s^1,g^2) & = f(i^1)f(s^1|i^1)f(g^2|i^1) \\
& = 0.3 \times 0.8 \times 0.17 \\
& = 0.0408
\end{align*}
\end{itemize}
}
\end{frame}
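\begin{frame}[fragile]{The Student Example}
\scriptsize{
\begin{itemize}
\item A minimal R sketch of the same calculation, using the three local tables from the slides:
\end{itemize}
\begin{verbatim}
f_i   <- c(i0 = 0.7, i1 = 0.3)
f_s_i <- matrix(c(0.95, 0.05,
                  0.20, 0.80), nrow = 2, byrow = TRUE,
                dimnames = list(c("i0","i1"), c("s0","s1")))
f_g_i <- matrix(c(0.20, 0.34, 0.46,
                  0.74, 0.17, 0.09), nrow = 2, byrow = TRUE,
                dimnames = list(c("i0","i1"), c("g1","g2","g3")))

# f(i1, s1, g2) = f(i1) * f(s1|i1) * f(g2|i1)
f_i["i1"] * f_s_i["i1","s1"] * f_g_i["i1","g2"]   # 0.0408
\end{verbatim}
}
\end{frame}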
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item The corresponding DAG would be as follows:
\begin{figure}[h!]
\centering
\includegraphics[scale=0.3]{pics/sat2.png}
\end{figure}
\item In this case, the alternative parameterization is more compact than the joint.
\item We now have three Bernoulli distributions $f(i), f(s | i^1)$ and $f(s | i^0)$, and two three-valued categorical distributions $f(g | i^1)$ and $f(g | i^0)$.
\item A categorical distribution is a discrete probability distribution that describes the possible results of a random variable that can take on one of K possible categories, with the probability of each category separately specified\footnote{\url{https://en.wikipedia.org/wiki/Categorical_distribution}}.
\end{itemize}
}
\end{frame}
\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item Each of the Bernoullis requires one independent parameter, and each three-valued categorical requires two independent parameters, for a total of seven (the counting is made explicit on the next slide).
\item By contrast, our joint probability function has twelve entries, so that eleven independent parameters are required to specify an arbitrary joint probability function over these three variables.
\item It is important to note another advantage of this way of representing the joint: \textbf{modularity}.
\item When we added the new variable $G$, the joint probability function changed entirely.
\item Had we used the explicit representation of the joint, we would have had to write down twelve new numbers.
\item In the factored representation, we could reuse our local probability models for the variables $I$ and $S$, and specify only the probability model for $G$, the conditional probability function $f(G | I)$.
\item This property will turn out to be invaluable in modeling real-world systems.
\end{itemize}
}
\end{frame}
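\begin{frame}{The Student Example}
\scriptsize{
\begin{itemize}
\item To make the parameter counting of the previous slide explicit (this is just the arithmetic behind it):
\begin{displaymath}
\underbrace{1}_{f(i)} + \underbrace{1+1}_{f(s|i^0),\, f(s|i^1)} + \underbrace{2+2}_{f(g|i^0),\, f(g|i^1)} = 7
\qquad \text{versus} \qquad 2 \cdot 2 \cdot 3 - 1 = 11.
\end{displaymath}
\item Each Bernoulli contributes one free parameter and each three-valued categorical contributes two, while the unrestricted joint over $(I,S,G)$ has $2 \cdot 2 \cdot 3 = 12$ entries constrained to sum to one.
\end{itemize}
}
\end{frame}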
\begin{frame}{Directed Acyclic Graphs (DAGs)}
\scriptsize{
\begin{itemize}
\item We just learned how the chain rule of probability and conditional independence assumptions enable us to obtain a compact representation of a joint probability function.
\item Now we are in a position to introduce DAGs.
\item A directed graph consists of a set of nodes with arrows between some nodes.
\item More formally, a directed graph $G$ consists of a set of vertices $V$ and an edge set $E$ of ordered pairs of vertices.
\item If $(Y, X) \in E$ then there is an arrow pointing from $Y$ to $X$.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.5]{pics/dag1.png}
\caption{A directed graph with vertices $V = \{X, Y, Z\}$ and edges $E = \{(Y, X), (Y, Z)\}$.}
\end{figure}
\end{itemize}
}
\end{frame}
\begin{frame}{Directed Acyclic Graphs (DAGs)}
\scriptsize{
\begin{itemize}
\item If an arrow connects two variables $X$ and $Y$ (in either direction) we say that $X$ and $Y$ are \textbf{adjacent}.
\item If there is an arrow from $X$ to $Y$ then $X$ is a \textbf{parent} of $Y$ and $Y$ is a \textbf{child} of $X$.
\item The set of all parents of X is denoted by $\pi_{X}$ or $\pi(X)$.
\item A \textbf{directed path} between two variables is a set of arrows all pointing in the same direction linking one variable to the other such as the chain shown below:
\begin{figure}[h!]
\centering
\includegraphics[scale=0.5]{pics/dag2.png}
\caption{A chain graph with a directed path.}
\end{figure}
\item A sequence of adjacent vertices starting with $X$ and ending with $Y$, but ignoring the
direction of the arrows, is called an \textbf{undirected path}.
\item $X$ is an \textbf{ancestor} of $Y$ if there is a directed path from $X$ to $Y$ (or $X = Y$).
\item We also say that $Y$ is a \textbf{descendant} of $X$.
\end{itemize}
}
\end{frame}
\begin{frame}{Directed Acyclic Graphs (DAGs)}
\scriptsize{
\begin{itemize}
\item A directed path that starts and ends at the same variable is called a \textbf{cycle}.
\item A directed graph is \textbf{acyclic} if it has no cycles.
\item In this case we say that the graph is a \textbf{directed
acyclic graph} or DAG.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.12]{pics/cycle.png}
\end{figure}
\item From now on, we only deal with directed acyclic graphs since it is very difficult to provide a coherent probability semantics over graphs with directed cycles.
\end{itemize}
}
\end{frame}
\begin{frame}{Probability and DAGs}
\scriptsize{
\begin{itemize}
\item When a DAG is used to encode a joint probability function $f$ (density or mass) with given conditional independence relations, we have a Directed Graphical Model or Bayesian Network.
\item Warning: It is common to find the terms DAG, DGM, and Bayesian network used interchangeably in the literature.
\item Let $G$ be a DAG with vertices $V = (X_1 , \dots , X_i, \dots, X_m )$.
\item Each vertex $X_i$ is a random variable.
\item The structure of $G$ encodes (conditional) independence relations between the variables.
\item We assume that the ordering of the vertices $(1,\dots,i,\dots,m)$ is a topological sort of the random variables.
\item This means that every variable comes before all its descendants in the graph.
\end{itemize}
}
\end{frame}
\begin{frame}{Probability and DAGs}
\scriptsize{
\begin{itemize}
\item For example, the next figure shows a DAG with four variables; the numbers indicate the topological order.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.4]{pics/dag3.png}
\end{figure}
\item It is clear from the figure that every vertex comes before all of its descendants in the DAG.
\item Let $f(x_1,\dots,x_m)$ be a probability function for $V$. We say that $G$ represents $f$ if
\begin{equation}
f(x_1,\dots,x_m) = \prod_{j=1}^m f(x_j| \pi_{x_j})
\end{equation}
where $\pi_{x_j}$ denotes the (values of the) parent nodes of $X_j$.
\end{itemize}
}
\end{frame}
\begin{frame}{Probability and DAGs}
\scriptsize{
\begin{itemize}
\item Based on the equation above, the probability function $f$ for the previous graph factorizes as follows:
\begin{displaymath}
f(o,s,h,c) = f(o)f(s)f(h|o,s)f(c|s).
\end{displaymath}
\item If no conditional independence assumptions were made, the joint probability function $f$ would be the following:
\begin{displaymath}
f(o,s,h,c) = f(o)f(s|o)f(h|o,s)f(c|o,s,h)
\end{displaymath}
\item This means that the graph above encodes the following independence assumptions (a numerical sanity check is sketched on the next slide):
\begin{enumerate}
\item $f(s|o) = f(s)$
\item $f(c|o,s,h) = f(c|s)$
\end{enumerate}
\end{itemize}
}
\end{frame}
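\begin{frame}[fragile]{Probability and DAGs}
\scriptsize{
\begin{itemize}
\item As a sanity check, a minimal R sketch (with arbitrary, made-up local tables, not taken from the slides) showing that the factorization $f(o)f(s)f(h|o,s)f(c|s)$ defines a valid joint distribution over the $2^4 = 16$ outcomes:
\end{itemize}
\begin{verbatim}
f_o  <- c(0.6, 0.4)                             # f(o)
f_s  <- c(0.3, 0.7)                             # f(s)
p_h1 <- matrix(c(0.9, 0.5,
                 0.2, 0.7), 2, 2, byrow = TRUE) # P(h = 1 | o, s)
p_c1 <- c(0.4, 0.8)                             # P(c = 1 | s)

total <- 0
for (o in 1:2) for (s in 1:2) for (h in 1:2) for (cc in 1:2) {
  f_h <- if (h == 1) p_h1[o, s] else 1 - p_h1[o, s]
  f_c <- if (cc == 1) p_c1[s]   else 1 - p_c1[s]
  total <- total + f_o[o] * f_s[s] * f_h * f_c
}
total   # 1: the four local tables define a full joint distribution
\end{verbatim}
}
\end{frame}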
\begin{frame}{Probability and DAGs}
\scriptsize{
\begin{itemize}
\item How can we know all the independence assumptions encoded by a DAG?
\item The answer is \textbf{d-separation}.
\item D-separation is a criterion for deciding whether a set of variables $X$ is independent of another set $Y$, given a third set $Z$.
\item The idea is to associate ``dependence'' with ``connectedness'' (i.e., the existence of a connecting path) and ``independence'' with ``unconnectedness'' or ``separation''.
\item Warning: D-separation is hard to understand.
\item Before trying to understand it we are going to introduce an R library for playing with DAGs: \textbf{Dagitty}.
\end{itemize}
}
\end{frame}
\begin{frame}{Dagitty}
\scriptsize{
\begin{itemize}
\item DAGitty is a browser-based environment for creating, editing, and analyzing DAGs: \url{http://dagitty.net/}.
\begin{figure}[h!]
\centering
\includegraphics[scale=0.2]{pics/DAGitty.png}
\end{figure}
\item The R package \texttt{dagitty} provides access to all of the capabilities of the DAGitty web application from within R.
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Dagitty}
\scriptsize{
\begin{itemize}
\item We can use dagitty to create the DAG from before by passing a \texttt{dag} description string to the \texttt{dagitty} function:
\begin{verbatim}
> library(dagitty)
>
> over <- dagitty("dag{ o -> h; s -> h; s ->c }")
\end{verbatim}
\item Now, in order to visualize the DAG we need to specify the coordinates of each vertex:
\begin{verbatim}
> coordinates(over) <-
list( x=c(o=0,h=1,s=2,c=3) , y=c(o=0,h=1,s=0,c=1) )
> plot(over)
\end{verbatim}
\begin{figure}[h!]
\centering
\includegraphics[scale=0.4]{pics/overdag1.pdf}
\end{figure}
\end{itemize}
}
\end{frame}
\begin{frame}[fragile]{Dagitty}
\scriptsize{
\begin{itemize}
\item We can obtain the children of a node:
\begin{verbatim}
> children(over,"s")
[1] "c" "h"
\end{verbatim}
\item Or the parents:
\begin{verbatim}
> parents( over, "h" )
[1] "o" "s"
\end{verbatim}
\item Or the undirected paths between two nodes:
\begin{verbatim}
> paths( over, "o", "c" )$path
[1] "o -> h <- s -> c"
\end{verbatim}
\item Or just the directed paths between two nodes:
\begin{verbatim}
> paths( over, "s", "c",directed = T )$path
[1] "s -> c"
\end{verbatim}
\item Finally, we can obtain all the conditional independencies encoded by the DAG: