IHDP 因果效应估计

DoWhy example on the IHDP (Infant Health and Development Program) dataset

[1]:
# importing required libraries
import os, sys
sys.path.append(os.path.abspath("../../../"))
import dowhy
from dowhy import CausalModel
import pandas as pd
import numpy as np
[2]:
# Loading Data
# The CSV is fetched from the AMLab-Amsterdam/CEVAE GitHub repository and is read
# with header=None, so the column names are assigned by hand below.
data = pd.read_csv(
    "https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv",
    header=None,
)

# Five named columns followed by the 25 covariates x1..x25.
col = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"] + [
    "x" + str(k) for k in range(1, 26)
]
data.columns = col

# Cast the treatment indicator to boolean; the other columns keep their parsed dtypes.
data = data.astype({"treatment": 'bool'}, copy=False)
data.head()
---------------------------------------------------------------------------
ConnectionResetError                      Traceback (most recent call last)
~/opt/anaconda3/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1316                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1317                           encode_chunked=req.has_header('Transfer-encoding'))
   1318             except OSError as err: # timeout error

~/opt/anaconda3/lib/python3.7/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1243         """Send a complete request to the server."""
-> 1244         self._send_request(method, url, body, headers, encode_chunked)
   1245

~/opt/anaconda3/lib/python3.7/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1289             body = _encode(body, 'body')
-> 1290         self.endheaders(body, encode_chunked=encode_chunked)
   1291

~/opt/anaconda3/lib/python3.7/http/client.py in endheaders(self, message_body, encode_chunked)
   1238             raise CannotSendHeader()
-> 1239         self._send_output(message_body, encode_chunked=encode_chunked)
   1240

~/opt/anaconda3/lib/python3.7/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027

~/opt/anaconda3/lib/python3.7/http/client.py in send(self, data)
    965             if self.auto_open:
--> 966                 self.connect()
    967             else:

~/opt/anaconda3/lib/python3.7/http/client.py in connect(self)
   1413             self.sock = self._context.wrap_socket(self.sock,
-> 1414                                                   server_hostname=server_hostname)
   1415

~/opt/anaconda3/lib/python3.7/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
    422             context=self,
--> 423             session=session
    424         )

~/opt/anaconda3/lib/python3.7/ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
    869                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 870                     self.do_handshake()
    871             except (OSError, ValueError):

~/opt/anaconda3/lib/python3.7/ssl.py in do_handshake(self, block)
   1138                 self.settimeout(None)
-> 1139             self._sslobj.do_handshake()
   1140         finally:

ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-2-f11c2ccb6294> in <module>
      1 # Loading Data
      2
----> 3 data= pd.read_csv("https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv", header = None)
      4 col =  ["treatment", "y_factual", "y_cfactual", "mu0", "mu1" ,]
      5 for i in range(1,26):

~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684
--> 685         return _read(filepath_or_buffer, kwds)
    686
    687     parser_f.__name__ = name

~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    438     # See https://github.com/python/mypy/issues/1297
    439     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
--> 440         filepath_or_buffer, encoding, compression
    441     )
    442     kwds["compression"] = compression

~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
    194
    195     if _is_url(filepath_or_buffer):
--> 196         req = urlopen(filepath_or_buffer)
    197         content_encoding = req.headers.get("Content-Encoding", None)
    198         if content_encoding == "gzip":

~/opt/anaconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223
    224 def install_opener(opener):

~/opt/anaconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
    523             req = meth(req)
    524
--> 525         response = self._open(req, data)
    526
    527         # post-process response

~/opt/anaconda3/lib/python3.7/urllib/request.py in _open(self, req, data)
    541         protocol = req.type
    542         result = self._call_chain(self.handle_open, protocol, protocol +
--> 543                                   '_open', req)
    544         if result:
    545             return result

~/opt/anaconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~/opt/anaconda3/lib/python3.7/urllib/request.py in https_open(self, req)
   1358         def https_open(self, req):
   1359             return self.do_open(http.client.HTTPSConnection, req,
-> 1360                 context=self._context, check_hostname=self._check_hostname)
   1361
   1362         https_request = AbstractHTTPHandler.do_request_

~/opt/anaconda3/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1317                           encode_chunked=req.has_header('Transfer-encoding'))
   1318             except OSError as err: # timeout error
-> 1319                 raise URLError(err)
   1320             r = h.getresponse()
   1321         except:

URLError: <urlopen error [Errno 54] Connection reset by peer>

1. Model

[3]:
# Create a causal model from the data and the given common causes.
# Covariates x1..x25 are declared as common causes of the treatment and the outcome.
#
# The names are built as a list rather than by splitting a "+"-terminated string:
# "x1+...+x25+".split('+') ends with an empty string, which would be handed to
# CausalModel as a 26th common cause that is not a column of `data`.
common_cause_names = ["x" + str(i) for i in range(1, 26)]

# `xs` keeps its original "+"-terminated value in case another cell reads it.
xs = "".join(name + "+" for name in common_cause_names)

model = CausalModel(
    data=data,
    treatment='treatment',
    outcome='y_factual',
    common_causes=common_cause_names,
)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-0ca3873b3459> in <module>
      5
      6 model=CausalModel(
----> 7         data = data,
      8         treatment='treatment',
      9         outcome='y_factual',

NameError: name 'data' is not defined

2. Identify

[4]:
# Identify the causal effect.
# The result is stored in `identified_estimand` and is passed to every
# estimate_effect / refute_estimate call in the cells that follow.
identified_estimand = model.identify_effect()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-7f4cfa782ba8> in <module>
      1 #Identify the causal effect
----> 2 identified_estimand = model.identify_effect()

NameError: name 'model' is not defined

3. Estimate (using different methods)

3.1 Using Linear Regression

[5]:
# Estimate the effect with the backdoor linear-regression method, then print it
# next to the raw difference in mean factual outcome between treated and
# untreated rows (labelled "ATE" in the output).
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    test_significance=True,
)

print(estimate)
print("Causal Estimate is " + str(estimate.value))

# Split the rows by treatment flag; data_1 / data_0 are reused by later cells.
treated_rows = data["treatment"] == 1
untreated_rows = data["treatment"] == 0
data_1 = data[treated_rows]
data_0 = data[untreated_rows]

mean_treated = np.mean(data_1["y_factual"])
mean_untreated = np.mean(data_0["y_factual"])
print("ATE", mean_treated - mean_untreated)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-8370db6b49dd> in <module>
      1 # Estimate the causal effect and compare it with Average Treatment Effect
----> 2 estimate = model.estimate_effect(identified_estimand,
      3         method_name="backdoor.linear_regression", test_significance=True
      4 )
      5

NameError: name 'model' is not defined

3.2 Using Propensity Score Matching

[6]:
# Propensity score matching estimate, printed together with the raw
# treated-minus-untreated difference in mean y_factual for comparison.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching",
)
print("Causal Estimate is " + str(estimate.value))

# data_1 / data_0 are the treated / untreated subsets built in the linear-regression cell.
mean_treated = np.mean(data_1["y_factual"])
mean_untreated = np.mean(data_0["y_factual"])
print("ATE", mean_treated - mean_untreated)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-e048d3331cdd> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
      2         method_name="backdoor.propensity_score_matching"
      3 )
      4
      5 print("Causal Estimate is " + str(estimate.value))

NameError: name 'model' is not defined

3.3 Using Propensity Score Stratification

[7]:
# Propensity score stratification estimate; the strata count and clipping
# threshold are passed through method_params.
stratification_params = {'num_strata': 50, 'clipping_threshold': 5}
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
    method_params=stratification_params,
)
print("Causal Estimate is " + str(estimate.value))

# Raw difference in mean y_factual between the treated (data_1) and untreated (data_0) subsets.
mean_treated = np.mean(data_1["y_factual"])
mean_untreated = np.mean(data_0["y_factual"])
print("ATE", mean_treated - mean_untreated)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-b3d22e2b80da> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
      2         method_name="backdoor.propensity_score_stratification", method_params={'num_strata':50, 'clipping_threshold':5}
      3 )
      4
      5 print("Causal Estimate is " + str(estimate.value))

NameError: name 'model' is not defined

3.4 Using Propensity Score Weighting

[8]:
# Propensity score weighting estimate. In notebook order this is the last
# assignment to `estimate`, so it is the one the refutation cells receive.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_weighting",
)
print("Causal Estimate is " + str(estimate.value))

# Raw difference in mean y_factual between the treated (data_1) and untreated (data_0) subsets.
mean_treated = np.mean(data_1["y_factual"])
mean_untreated = np.mean(data_0["y_factual"])
print("ATE", mean_treated - mean_untreated)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-29f974873ddf> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
      2         method_name="backdoor.propensity_score_weighting"
      3 )
      4
      5 print("Causal Estimate is " + str(estimate.value))

NameError: name 'model' is not defined

4. Refute

Refute the obtained estimate using multiple robustness checks.

4.1 Adding a random common cause

[9]:
# Refute the current `estimate` with the "random_common_cause" method and print the result.
refute_results = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="random_common_cause",
)
print(refute_results)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-d9e65b870ed1> in <module>
----> 1 refute_results=model.refute_estimate(identified_estimand, estimate,
      2         method_name="random_common_cause")
      3 print(refute_results)

NameError: name 'model' is not defined

4.2 Using a placebo treatment

[10]:
# Refute the current `estimate` with the placebo-treatment refuter
# (placebo_type="permute") and print the result.
res_placebo = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",
)
print(res_placebo)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-35ea45ecbbe4> in <module>
----> 1 res_placebo=model.refute_estimate(identified_estimand, estimate,
      2         method_name="placebo_treatment_refuter", placebo_type="permute")
      3 print(res_placebo)

NameError: name 'model' is not defined

4.3 Data Subset Refuter

[11]:
# Refute the current `estimate` with the data-subset refuter
# (subset_fraction=0.9) and print the result.
res_subset = model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="data_subset_refuter",
    subset_fraction=0.9,
)
print(res_subset)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-a7d2075561d8> in <module>
----> 1 res_subset=model.refute_estimate(identified_estimand, estimate,
      2         method_name="data_subset_refuter", subset_fraction=0.9)
      3 print(res_subset)

NameError: name 'model' is not defined