py-why · adam2392 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023 · Jul 27, 2023
diff --git a/doc/api.rst b/doc/api.rst
@@ -61,6 +61,7 @@ causal graph operations.
    find_connected_pairs
    add_all_snode_combinations
    compute_invariant_domains_per_node
+   remove_snode_edge
 
 Conversions between other package's causal graphs
 =================================================
@@ -163,10 +164,10 @@ linear structural equation Gaussian models (SEMs).
 .. autosummary::
    :toctree: generated/
 
-   make_graph_linear_gaussian
+   make_random_linear_gaussian_graph
    apply_linear_soft_intervention
    set_node_attributes_with_G
-   make_graph_multidomain
+   make_random_multidomain_graph
 
 Visualization of causal graphs
 ==============================

diff --git a/doc/conf.py b/doc/conf.py
@@ -236,6 +236,8 @@ def setup(app):
     "graphviz": ("https://graphviz.readthedocs.io/en/stable/", None),
     "sphinx-gallery": ("https://sphinx-gallery.github.io/stable/", None),
     "pgmpy": ("https://pgmpy.org/", None),
+    "dodiscover": ("https://pywhy.org/dodiscover/dev/", None),
+    "dowhy": ("https://pywhy.org/dowhy/dev/", None),
 }
 intersphinx_timeout = 5
 
@@ -303,6 +305,7 @@ def setup(app):
             "../examples/intro",
             "../examples/visualization",
             "../examples/simulations",
+            "../examples/multiple-domains",
         ]
     ),
     # "filename_pattern": "^((?!sgskip).)*$",

diff --git a/doc/reference/functional/index.rst b/doc/reference/functional/index.rst
@@ -266,7 +266,7 @@ Linear functional graphs
 .. autosummary::
    :toctree: ../../generated/
 
-   make_graph_linear_gaussian
+   make_random_linear_gaussian_graph
    apply_linear_soft_intervention
 
 Multidomain
@@ -306,4 +306,4 @@ Linear functional selection diagrams
 .. autosummary::
    :toctree: ../../generated/
 
-   make_graph_multidomain
+   make_random_multidomain_graph
diff --git a/doc/references.bib b/doc/references.bib
@@ -17,6 +17,14 @@ @article{bareinboim_causal_2016
   pages    = {7345--7352}
 }
 
+@incollection{pearl2022external,
+  title={External validity: From do-calculus to transportability across populations},
+  author={Pearl, Judea and Bareinboim, Elias},
+  booktitle={Probabilistic and causal inference: The works of Judea Pearl},
+  pages={451--482},
+  year={2022}
+}
+
 @article{Colombo2012,
   author    = {Diego Colombo and Marloes H. Maathuis and Markus Kalisch and Thomas S. Richardson},
   title     = {{Learning high-dimensional directed acyclic graphs with latent and selection variables}},
@@ -100,6 +108,14 @@ @article{gerhardus2021characterization
   year    = {2021}
 }
 
+@techreport{li2023discovery,
+  author = "Li, A. and Jaber, A. and Bareinboim, E.",
+  title = "Causal Discovery from Observational and Interventional Data Across Multiple Environments",
+  year = "2023",
+  month = "May",
+  number = "R-98",
+  institution = "Causal Artificial Intelligence Lab, Columbia University"
+}
 
 @inproceedings{Malinsky18a_svarfci,
   title     = {Causal Structure Learning from Multivariate Time Series in Settings with Unmeasured Confounding},

diff --git a/doc/whats_new/v0.1.rst b/doc/whats_new/v0.1.rst
@@ -40,8 +40,8 @@ Changelog
 - |Feature| Implement export/import functions to go to/from pywhy-graphs to pcalg and tetrad, by `Adam Li`_ (:pr:`60`)
 - |Feature| Implement export/import functions to go to/from pywhy-graphs to ananke-causal, by `Jaron Lee`_ (:pr:`63`)
 - |Feature| Implement pre-commit hooks for development, by `Jaron Lee`_ (:pr:`68`)
-- |Feature| Implement a new submodule for converting graphs to a functional model, with :func:`pywhy_graphs.functional.make_graph_linear_gaussian`, by `Adam Li`_ (:pr:`75`)
-- |Feature| Implement a multidomain linear functional graph, with :func:`pywhy_graphs.functional.make_graph_multidomain`, by `Adam Li`_ (:pr:`77`)
+- |Feature| Implement a new submodule for converting graphs to a functional model, with :func:`pywhy_graphs.functional.make_random_linear_gaussian_graph`, by `Adam Li`_ (:pr:`75`)
+- |Feature| Implement a multidomain linear functional graph, with :func:`pywhy_graphs.functional.make_random_multidomain_graph`, by `Adam Li`_ (:pr:`77`)
 - |Feature| Implement and test functions to find inducing paths between two nodes, `Aryan Roy`_ (:pr:`78`)
 - |Feature| Implement general functional API for sampling and generating a functional causal graph,  by `Adam Li`_ (:pr:`82`)
 

diff --git a/doc/whats_new/v0.2.rst b/doc/whats_new/v0.2.rst
@@ -27,6 +27,7 @@ Changelog
 ---------
 - |Feature| Implement and test functions to validate an MAG and check the presence of almost directed cycles, by `Aryan Roy`_ (:pr:`91`)
 - |Feature| Implement and test functions to convert a DAG to MAG, by `Aryan Roy`_ (:pr:`96`)
+- |Feature| Add algorithms for interfacing with a selection diagram in ``pywhy_graphs.algorithms.multidomain``, by `Adam Li`_ (:pr:`88`)
 
 Code and Documentation Contributors
 -----------------------------------

diff --git a/examples/multiple-domains/README.txt b/examples/multiple-domains/README.txt
@@ -0,0 +1,4 @@
+Examples Representing Causal Selection Diagrams Over Multiple Domains
+---------------------------------------------------------------------
+
+Examples demonstrating how to represent causal diagrams for multiple domains.
diff --git a/examples/multiple-domains/plot_selection_diagram.py b/examples/multiple-domains/plot_selection_diagram.py
@@ -0,0 +1,147 @@
+"""
+.. _ex-selection-diagrams:
+
+=========================================================
+An introduction to selection diagrams and how to use them
+=========================================================
+
+Selection diagrams are causal graphical objects that allow the user and scientist
+to represent causal models with multiple domains. This is useful for representing
+domain-shifts, generalizability and invariances across different environments.
+For a detailed theoretical introduction to selection diagrams, see
+:footcite:`bareinboim_causal_2016,pearl2022external`.
+
+This is a common problem in machine learning, where the goal is to learn a model
+that generalizes to unseen data. In this case, the unseen data can be a different
+domain, and the model needs to be invariant across domains.
+
+This short example introduces selection diagrams, shows how they are constructed,
+and explains how they differ from regular causal graphs.
+"""
+
+# %%
+# Import the required libraries
+# -----------------------------
+from pprint import pprint
+
+import pywhy_graphs as pg
+from pywhy_graphs.algorithms import compute_invariant_domains_per_node, remove_snode_edge
+from pywhy_graphs.viz import draw
+
+# %%
+# Build a selection diagram
+# -------------------------
+# Let us assume that there are only two domains in our causal model.
+#
+# A selection diagram fundamentally represents two different SCMs that represent
+# the two different domains, but share some common variables and causal structure.
+# Let M1 and M2 represent two different SCMs. Each SCM is a 4-tuple of the functionals,
+# endogenous (observed) variables, exogenous (latent) variables and the probability
+# distribution over the exogenous variables.
+#
+# :math:`M1 = \langle \mathcal{F}, V, U, P(u) \rangle`
+#   .. math::
+#     V = \{W, X, Y, Z\}
+#     P(U) = P(U_W, U_X, U_Y, U_Z)
+#     \mathcal{F} = \begin{cases}
+#           W = f_W(U_W) \\
+#           X = f_X(U_X) \\
+#           Y = f_Y(W, X, U_Y) \\
+#           Z = f_Z(X, Y, U_Z)
+#       \end{cases}
+#
+# :math:`M2 = \langle \mathcal{F'}, V, U', P'(u) \rangle`
+#   .. math::
+#     P'(U') = P'(U'_W, U'_X, U'_Y, U'_Z)
+#     \mathcal{F'} = \begin{cases}
+#           W = f'_W(U'_W) \\
+#           X = f'_X(U'_X) \\
+#           Y = f'_Y(W, X, U'_Y) \\
+#           Z = f'_Z(X, Y, U'_Z)
+#       \end{cases}
+#
+# These two SCMs share the same causal structure, but the mechanisms for generating
+# each variable may be different either due to different distributions over the
+# exogenous variables, or different functional forms. The selection diagram encodes
+# this information via an extra node, called the S-node, which represents the possibility
+# of a difference in the data-generating mechanisms for the nodes it points to. The
+# lack of an S-node pointing to a variable indicates that the data-generating mechanism
+# for that variable is the same, or invariant across the two domains. This notion can
+# be extended to N domains, where there are now :math:`\binom{N}{2}` S-nodes.
+#
+# The most general version of a selection diagram allows S-nodes to represent a
+# change in graphical structure. We do not explore that generality in this example
+# or in this package :footcite:`pearl2022external`.
+#
+# We will now construct the selection diagram for the SCMs above, extended to three domains.
+
+# %%
+
+G = pg.AugmentedGraph()
+G.add_edges_from(
+    [
+        ("W", "Y"),
+        ("X", "Y"),
+        ("X", "Z"),
+        ("Y", "Z"),
+    ],
+    edge_type=G.directed_edge_name,
+)
+G.add_s_node(domain_ids=(1, 2), node_changes=["W", "X", "Y", "Z"])
+G.add_s_node(domain_ids=(2, 3), node_changes=["W", "X", "Y", "Z"])
+G.add_s_node(domain_ids=(1, 3), node_changes=["W", "X", "Y", "Z"])
+
+draw(G)
+
+# %%
+# Imposing cross-domain invariances
+# ---------------------------------
+# The selection diagram above allows for the possibility of different data-generating
+# mechanisms for each variable. Currently, the S-nodes point to every single
+# node in the graph. Therefore, there is no invariance across domains. Simply put,
+# the data-generating mechanisms for each variable can be different across domains.
+#
+# However, we may want to impose invariances across domains 1 and 2 for the variables
+# W and X. This can be done by removing the edges to W and X from the S-node that
+# corresponds to domains 1 and 2.
+
+# first, get the mapping from domain ids to s-nodes
+domain_id_to_s_node = G.domain_ids_to_snodes
+
+# remove the edges S^{1, 2} -> W and S^{1, 2} -> X
+G = remove_snode_edge(G, domain_id_to_s_node[frozenset([1, 2])], "W")
+G = remove_snode_edge(G, domain_id_to_s_node[frozenset([1, 2])], "X")
+
+draw(G)
+
+# let's explicitly compute the invariant domains per node
+G = compute_invariant_domains_per_node(G, "W")
+pprint(G.nodes(data=True))
+
+# %%
+# Consistency in cross-domain invariances
+# ---------------------------------------
+# In :footcite:`li2023discovery`, it is noted that there may be inconsistencies
+# when removing S-node edges. For example, if we remove the edge S^{1, 2} -> W,
+# and then remove the edge S^{2, 3} -> W, then the edge S^{1, 3} -> W must also be
+# removed. This is because the invariances are transitive. In pywhy-graphs, the
+# :func:`pywhy_graphs.algorithms.remove_snode_edge` function automatically checks for
+# these inconsistencies and removes the offending edges.
+
+G = remove_snode_edge(G, domain_id_to_s_node[frozenset([2, 3])], "W")
+
+# now the S-node edge corresponding to S^{1, 3} -> W should be removed as well
+draw(G)
+
+# %%
+# Summary
+# -------
+# In this example, we have seen how to construct a selection diagram. We have also
+# seen how to model invariances across domains using S-nodes and the lack of S-node edges
+# to certain nodes in the graph.
+
+
+# %%
+# References
+# ----------
+# .. footbibliography::
diff --git a/examples/simulations/README.txt b/examples/simulations/README.txt
@@ -0,0 +1,5 @@
+Examples Simulating Data From Causal Diagrams
+---------------------------------------------
+
+Examples demonstrating how to simulate data stemming from causal diagrams in a variety of different
+settings.
diff --git a/examples/simulations/plot_discrete_causal_bayesian_network.py b/examples/simulations/plot_discrete_causal_bayesian_network.py
@@ -0,0 +1,108 @@
+"""
+.. _ex-discrete-cbn:
+
+=============================================================
+Discrete Causal Bayesian Networks and Simulated Discrete Data
+=============================================================
+
+Discrete data arises commonly in many applications. For example, data is typically stored
+in a table, which groups values into discrete categories.
+In survey data, answers are typically multiple choice. In medical settings, many symptoms
+are rated on a scale of, for example, 1-5. Or a data feature may indicate whether
+a certain disease is present, resulting in a binary variable.
+
+Even if the data is discrete, it is still generated by some unknown structural causal model,
+which induces a causal diagram. To evaluate causal algorithms, one typically needs to generate
+different causal models and then datasets with varying sample sizes from them.
+
+In this example, we illustrate how to generate discrete data from a random causal graph.
+
+For information on generating continuous data from a causal graph, one can see
+:ref:`ex-linear-gaussian-graph`.
+"""
+
+# %%
+# Import the required libraries
+# -----------------------------
+import networkx as nx
+from pgmpy.factors.discrete.CPD import TabularCPD
+
+from pywhy_graphs.functional import sample_from_graph
+from pywhy_graphs.functional.discrete import make_random_discrete_graph
+from pywhy_graphs.viz import draw
+
+
+# helper that prints a CPD in full by temporarily disabling TabularCPD's table truncation
+def print_full(cpd):
+    backup = TabularCPD._truncate_strtable
+    TabularCPD._truncate_strtable = lambda self, x: x
+    print(cpd)
+    TabularCPD._truncate_strtable = backup
+
+
+# %%
+# Construct the causal graph
+# --------------------------
+# In order to generate the data, we start from a causal graph that informs us how
+# data is generated. That is, each variable is a function of its exogenous noise distribution
+# and its parent values.
+edge_list = [
+    ("A", "B"),
+    ("B", "C"),
+    ("C", "D"),
+    ("B", "D"),
+    ("X", "A"),
+    ("X", "C"),
+    ("C", "W"),
+]
+G = nx.DiGraph()
+
+G.add_edges_from(edge_list)
+
+draw(G)
+
+# %%
+# Define functional relationships on the graph
+# --------------------------------------------
+# In order to generate data, we need to define the full functional relationship
+# between every node and its parents and also how to generate parent-less nodes.
+# We leverage the :class:`pgmpy.factors.discrete.CPD.TabularCPD` abstraction to
+# represent conditional probability distributions as tables.
+
+cardinality_lims = {node: [2, 4] for node in G.nodes}
+weight_lims = {node: [1, 100] for node in G.nodes}
+noise_ratio_lims = {node: [0.1, 0.1] for node in G.nodes}
+seed = 1234
+
+G = make_random_discrete_graph(
+    G,
+    cardinality_lims=cardinality_lims,
+    weight_lims=weight_lims,
+    noise_ratio_lims=noise_ratio_lims,
+    random_state=seed,
+    overwrite=True,
+)
+
+print(G)
+
+# we can extract the conditional probability table for each node, which is a function of its parents
+node_dict = G.nodes["C"]
+
+# We see that each node is fully defined given a conditional probability table, stored as a node
+# attribute under the keyword 'cpd'. For more information on the CPD object, see
+# pgmpy's documentation on :class:`pgmpy.factors.discrete.CPD.TabularCPD`. Note this
+# is in contrast with what node attributes are required in general for simulating data
+# from a causal graph in pywhy-graphs.
+print_full(node_dict["cpd"])
+
+# %%
+# Sample data from the graph
+# --------------------------
+# Now, we can sample data from the graph that is generated according to the causal diagram.
+# This data can be used for instance to evaluate causal discovery algorithms from
+# :mod:`dodiscover`, or causal estimation algorithms from :mod:`dowhy`.
+
+# now we sample from the graph the discrete dataset
+df = sample_from_graph(G, n_samples=2000, n_jobs=1, random_state=seed)
+
+print(df.head())
diff --git a/examples/simulations/plot_graphs_with_interventions.py b/examples/simulations/plot_graphs_with_interventions.py
diff --git a/examples/simulations/plot_linear_gaussian_causal_graph.py b/examples/simulations/plot_linear_gaussian_causal_graph.py
@@ -0,0 +1,21 @@
+"""
+.. _ex-linear-gaussian-graph:
+
+=====================================================
+Linear Gaussian Graphs and Generating Continuous Data
+=====================================================
+
+Linear Gaussian graphs are an important model. These are joint distributions
+that follow the structure of a causal graph, where exogenous noise distributions are Gaussian
+and nodes are linear combinations of their parents perturbed by the exogenous variable.
+
+Thus, each edge is associated with a weight of how the parent node is added to the current
+node.
+
+In this example, we illustrate how to generate continuous data from a linear Gaussian
+causal graph.
+
+For information on generating discrete data from a causal graph, one can see
+:ref:`ex-discrete-cbn`. Consider reading the user-guide, :ref:`functional-causal-graphical-models`
+to understand how arbitrary functional relationships are encoded in a causal graph.
+"""