IHDP 因果效应估计¶
DoWhy example on the IHDP (Infant Health and Development Program) dataset
[1]:
# importing required libraries
import os, sys
sys.path.append(os.path.abspath("../../../"))
import dowhy
from dowhy import CausalModel
import pandas as pd
import numpy as np
[2]:
# Loading Data
# The CSV is read with header=None, so column names are assigned explicitly
# below: five named columns followed by the covariates x1..x25.
data = pd.read_csv("https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv", header=None)
col = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1"]
col.extend("x" + str(i) for i in range(1, 26))
data.columns = col
# Cast the treatment column to bool; all other columns keep their dtype.
data = data.astype({"treatment": "bool"}, copy=False)
# Last expression of the cell: the notebook displays the first rows.
data.head()
---------------------------------------------------------------------------
ConnectionResetError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1316 h.request(req.get_method(), req.selector, req.data, headers,
-> 1317 encode_chunked=req.has_header('Transfer-encoding'))
1318 except OSError as err: # timeout error
~/opt/anaconda3/lib/python3.7/http/client.py in request(self, method, url, body, headers, encode_chunked)
1243 """Send a complete request to the server."""
-> 1244 self._send_request(method, url, body, headers, encode_chunked)
1245
~/opt/anaconda3/lib/python3.7/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
1289 body = _encode(body, 'body')
-> 1290 self.endheaders(body, encode_chunked=encode_chunked)
1291
~/opt/anaconda3/lib/python3.7/http/client.py in endheaders(self, message_body, encode_chunked)
1238 raise CannotSendHeader()
-> 1239 self._send_output(message_body, encode_chunked=encode_chunked)
1240
~/opt/anaconda3/lib/python3.7/http/client.py in _send_output(self, message_body, encode_chunked)
1025 del self._buffer[:]
-> 1026 self.send(msg)
1027
~/opt/anaconda3/lib/python3.7/http/client.py in send(self, data)
965 if self.auto_open:
--> 966 self.connect()
967 else:
~/opt/anaconda3/lib/python3.7/http/client.py in connect(self)
1413 self.sock = self._context.wrap_socket(self.sock,
-> 1414 server_hostname=server_hostname)
1415
~/opt/anaconda3/lib/python3.7/ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
422 context=self,
--> 423 session=session
424 )
~/opt/anaconda3/lib/python3.7/ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
869 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
--> 870 self.do_handshake()
871 except (OSError, ValueError):
~/opt/anaconda3/lib/python3.7/ssl.py in do_handshake(self, block)
1138 self.settimeout(None)
-> 1139 self._sslobj.do_handshake()
1140 finally:
ConnectionResetError: [Errno 54] Connection reset by peer
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
<ipython-input-2-f11c2ccb6294> in <module>
1 # Loading Data
2
----> 3 data= pd.read_csv("https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv", header = None)
4 col = ["treatment", "y_factual", "y_cfactual", "mu0", "mu1" ,]
5 for i in range(1,26):
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
438 # See https://github.com/python/mypy/issues/1297
439 fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
--> 440 filepath_or_buffer, encoding, compression
441 )
442 kwds["compression"] = compression
~/opt/anaconda3/lib/python3.7/site-packages/pandas/io/common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
194
195 if _is_url(filepath_or_buffer):
--> 196 req = urlopen(filepath_or_buffer)
197 content_encoding = req.headers.get("Content-Encoding", None)
198 if content_encoding == "gzip":
~/opt/anaconda3/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~/opt/anaconda3/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
523 req = meth(req)
524
--> 525 response = self._open(req, data)
526
527 # post-process response
~/opt/anaconda3/lib/python3.7/urllib/request.py in _open(self, req, data)
541 protocol = req.type
542 result = self._call_chain(self.handle_open, protocol, protocol +
--> 543 '_open', req)
544 if result:
545 return result
~/opt/anaconda3/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~/opt/anaconda3/lib/python3.7/urllib/request.py in https_open(self, req)
1358 def https_open(self, req):
1359 return self.do_open(http.client.HTTPSConnection, req,
-> 1360 context=self._context, check_hostname=self._check_hostname)
1361
1362 https_request = AbstractHTTPHandler.do_request_
~/opt/anaconda3/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1317 encode_chunked=req.has_header('Transfer-encoding'))
1318 except OSError as err: # timeout error
-> 1319 raise URLError(err)
1320 r = h.getresponse()
1321 except:
URLError: <urlopen error [Errno 54] Connection reset by peer>
1. Model¶
[3]:
# Create a causal model from the data and given common causes.
# Join the covariate names x1..x25 with "+" and no trailing separator:
# appending "+" after every name made xs.split('+') return an extra empty
# string, which was then passed as a 26th common cause that is not a column
# of `data`.
xs = "+".join("x" + str(i) for i in range(1, 26))

model = CausalModel(
    data=data,
    treatment='treatment',
    outcome='y_factual',
    common_causes=xs.split('+'),
)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-3-0ca3873b3459> in <module>
5
6 model=CausalModel(
----> 7 data = data,
8 treatment='treatment',
9 outcome='y_factual',
NameError: name 'data' is not defined
2. Identify¶
[4]:
# Identify the causal effect.
# The result is stored in `identified_estimand`, which every estimation and
# refutation cell below passes back to the model.
identified_estimand = model.identify_effect()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-4-7f4cfa782ba8> in <module>
1 #Identify the causal effect
----> 2 identified_estimand = model.identify_effect()
NameError: name 'model' is not defined
3. Estimate (using different methods)¶
3.1 Using Linear Regression¶
[5]:
# Estimate the effect with the backdoor linear-regression method (significance
# test enabled) and print it next to the raw difference in mean observed
# outcome between the treated and untreated rows.
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression",
    test_significance=True,
)
print(estimate)
print("Causal Estimate is " + str(estimate.value))

# Split rows by treatment status; data_1 / data_0 are reused by later cells.
is_treated = data["treatment"] == 1
is_control = data["treatment"] == 0
data_1 = data[is_treated]
data_0 = data[is_control]
treated_mean = np.mean(data_1["y_factual"])
control_mean = np.mean(data_0["y_factual"])
print("ATE", treated_mean - control_mean)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-5-8370db6b49dd> in <module>
1 # Estimate the causal effect and compare it with Average Treatment Effect
----> 2 estimate = model.estimate_effect(identified_estimand,
3 method_name="backdoor.linear_regression", test_significance=True
4 )
5
NameError: name 'model' is not defined
3.2 Using Propensity Score Matching¶
[6]:
# Re-estimate the effect with propensity score matching and print it next to
# the difference in mean observed outcome between data_1 and data_0
# (both defined in the linear-regression cell).
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching",
)
print("Causal Estimate is " + str(estimate.value))

treated_mean = np.mean(data_1["y_factual"])
control_mean = np.mean(data_0["y_factual"])
print("ATE", treated_mean - control_mean)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-6-e048d3331cdd> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
2 method_name="backdoor.propensity_score_matching"
3 )
4
5 print("Causal Estimate is " + str(estimate.value))
NameError: name 'model' is not defined
3.3 Using Propensity Score Stratification¶
[7]:
# Re-estimate the effect with propensity score stratification, passing the
# strata count and clipping threshold through method_params, then print it
# next to the difference in mean observed outcome between data_1 and data_0
# (both defined in the linear-regression cell).
stratification_params = {'num_strata': 50, 'clipping_threshold': 5}
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_stratification",
    method_params=stratification_params,
)
print("Causal Estimate is " + str(estimate.value))

treated_mean = np.mean(data_1["y_factual"])
control_mean = np.mean(data_0["y_factual"])
print("ATE", treated_mean - control_mean)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-7-b3d22e2b80da> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
2 method_name="backdoor.propensity_score_stratification", method_params={'num_strata':50, 'clipping_threshold':5}
3 )
4
5 print("Causal Estimate is " + str(estimate.value))
NameError: name 'model' is not defined
3.4 Using Propensity Score Weighting¶
[8]:
# Re-estimate the effect with propensity score weighting and print it next to
# the difference in mean observed outcome between data_1 and data_0
# (both defined in the linear-regression cell).
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_weighting",
)
print("Causal Estimate is " + str(estimate.value))

treated_mean = np.mean(data_1["y_factual"])
control_mean = np.mean(data_0["y_factual"])
print("ATE", treated_mean - control_mean)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-8-29f974873ddf> in <module>
----> 1 estimate = model.estimate_effect(identified_estimand,
2 method_name="backdoor.propensity_score_weighting"
3 )
4
5 print("Causal Estimate is " + str(estimate.value))
NameError: name 'model' is not defined
4. Refute¶
Refute the obtained estimate using multiple robustness checks.
4.1 Adding a random common cause¶
[9]:
# Run the "random_common_cause" refuter against the current estimate and
# print the result.
# NOTE(review): `estimate` is whatever the most recently executed estimation
# cell assigned; in notebook order that is the propensity score weighting one.
refute_results=model.refute_estimate(identified_estimand, estimate,
method_name="random_common_cause")
print(refute_results)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-9-d9e65b870ed1> in <module>
----> 1 refute_results=model.refute_estimate(identified_estimand, estimate,
2 method_name="random_common_cause")
3 print(refute_results)
NameError: name 'model' is not defined
4.2 Using a placebo treatment¶
[10]:
# Run the "placebo_treatment_refuter" with placebo_type="permute" against the
# current estimate and print the result.
# NOTE(review): `estimate` is whatever the most recently executed estimation
# cell assigned; in notebook order that is the propensity score weighting one.
res_placebo=model.refute_estimate(identified_estimand, estimate,
method_name="placebo_treatment_refuter", placebo_type="permute")
print(res_placebo)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-10-35ea45ecbbe4> in <module>
----> 1 res_placebo=model.refute_estimate(identified_estimand, estimate,
2 method_name="placebo_treatment_refuter", placebo_type="permute")
3 print(res_placebo)
NameError: name 'model' is not defined
4.3 Data Subset Refuter¶
[11]:
# Run the "data_subset_refuter" with subset_fraction=0.9 against the current
# estimate and print the result.
# NOTE(review): `estimate` is whatever the most recently executed estimation
# cell assigned; in notebook order that is the propensity score weighting one.
res_subset=model.refute_estimate(identified_estimand, estimate,
method_name="data_subset_refuter", subset_fraction=0.9)
print(res_subset)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-11-a7d2075561d8> in <module>
----> 1 res_subset=model.refute_estimate(identified_estimand, estimate,
2 method_name="data_subset_refuter", subset_fraction=0.9)
3 print(res_subset)
NameError: name 'model' is not defined