# Source code for lingam.lim

"""
Python implementation of the LiNGAM algorithms.
The LiNGAM Project: https://sites.google.com/view/sshimizu06/lingam
"""

import numpy as np
import scipy.optimize as sopt
from scipy.special import expit as sigmoid
# import time
import pandas as pd
import networkx as nx
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
# from Combining import likelihood
# import random
# import itertools
from itertools import product # , combinations, chain
import lingam.utils as ut


class LiM:
    """Implementation of LiM Algorithm [1]_

    References
    ----------
    .. [1] Zeng Y, Shimizu S, Matsui H, et al. Causal discovery for linear
       mixed data[C]//Conference on Causal Learning and Reasoning.
       PMLR, 2022: 994-1009.
    """

    def __init__(
        self,
        lambda1=0.1,
        loss_type='mixed',
        max_iter=150,
        h_tol=1e-8,
        rho_max=1e+16,
        w_threshold=0.1,
    ):
        """Construct a LiM model.

        Parameters
        ----------
        lambda1 : float, optional (default=0.1)
            L1 penalty parameter.
        loss_type : str, (default='mixed')
            Type of distribution of the noise.
        max_iter : int, (default=150)
            Maximum number of dual ascent steps.
        h_tol : float, (default=1e-8)
            Tolerance parameter of the acyclicity constraint.
        rho_max : float, (default=1e+16)
            Maximum value of the regularization parameter rho.
        w_threshold : float (default=0.1)
            Drop edge if the weight btw. variables is less than w_threshold.
        """
        self._lambda1 = lambda1
        self._loss_type = loss_type
        self._max_iter = max_iter
        self._h_tol = h_tol
        self._rho_max = rho_max
        self._w_threshold = w_threshold
        self._adjacency_matrix = None

    def fit(self, X, dis_con):
        """Fit the model to X with mixed data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where ``n_samples`` is the number of samples
            and ``n_features`` is the number of observed variables.
        dis_con : array-like, shape (1, n_features)
            Indicators of discrete or continuous variables, where "1"
            indicates a continuous variable, while "0" a discrete variable.

        Returns
        -------
        self : object
            Returns the instance of self.
        """
        W_min_lss = self._estimate_LiM(X, dis_con)
        self._adjacency_matrix = W_min_lss
        return self._adjacency_matrix

    def _estimate_LiM(self, X, dis_con):
        """Estimate the adjacency matrix btw. mixed variables."""

        def _loss(W):
            """Evaluate value and gradient of loss."""
            M = X @ W
            if self._loss_type == 'logistic':
                loss = 1.0 / X.shape[0] * (np.logaddexp(0, M) - X * M).sum()
                G_loss = 1.0 / X.shape[0] * X.T @ (sigmoid(M) - X)
            elif self._loss_type == 'laplace':
                R = X - M
                loss = - 1.0 / X.shape[0] * np.sum(- np.log(np.cosh(R)))
                G_loss = 1.0 / X.shape[0] * X.T @ np.tanh(R)
            elif self._loss_type == 'mixed':
                R = X - M
                # a1/a2 weight the discrete (logistic) and continuous
                # (Laplace) parts of the likelihood; both 1 in the paper.
                a1, a2 = 1, 1
                loss_dis = ((np.logaddexp(0, M) - X * M)
                            * np.absolute(dis_con - 1)).sum()
                loss_con = np.sum(-np.log(np.cosh(R)) * dis_con)
                loss = -1.0 / X.shape[0] * (a1 * loss_dis + a2 * loss_con)
                # Masks selecting rows/columns that touch a discrete (W_dis)
                # or continuous (W_con) variable, so each gradient term only
                # applies to its own variable type.
                W_dis = np.zeros([d, d])
                W_con = np.zeros([d, d])
                for ii in np.where(dis_con[0, :] == 0):
                    W_dis[[ii], :] = 1
                    W_dis[:, [ii]] = 1
                for jj in np.where(dis_con[0, :] == 1):
                    W_con[[jj], :] = 1
                    W_con[:, [jj]] = 1
                G_dis = X.T @ (sigmoid(M) - X) * W_dis
                G_con = - X.T @ np.tanh(R) * W_con
                G_loss = 1.0 / X.shape[0] * (G_dis + G_con)
            elif self._loss_type == 'mixed_dag':
                # Score the thresholded graph with a BIC-style likelihood.
                dag = nx.DiGraph(W)
                lingam_data = np.transpose(X)  # lingam_data: dims * samples
                df = pd.DataFrame(X)
                is_continuous = dis_con[0, :].astype(bool)
                is_discrete = np.invert(is_continuous)
                loss = - _bic_scoring(dag, is_discrete, df, lingam_data)
                R = X - M
                G_loss = 1.0 / X.shape[0] * (X.T @ (sigmoid(M) - X)
                                             - X.T @ np.tanh(R))
            else:
                raise ValueError('unknown loss type')
            return loss, G_loss

        def _bic_scoring(dag, is_discrete, df, lingam_data):
            """Evaluate value of loss given the DAG (log-likelihood minus
            a BIC penalty)."""
            sample_size = df.shape[0]
            # One parameter per edge plus one per node (intercept/bias).
            K = dag.number_of_edges() + dag.number_of_nodes()
            penalty = np.log(sample_size) / 2 * K
            total_score = 0.0 - penalty
            for i in dag.nodes:
                parents_i = [j for j in dag.predecessors(i)]
                if is_discrete[i]:
                    # Discrete variable.
                    if not parents_i:
                        # Root Bernoulli binary variable: likelihood using
                        # Bernoulli model with MLE parameters.
                        frequency_table = df[i].value_counts()
                        likelihood_bernoulli = 0.0
                        for count_k in frequency_table:
                            likelihood_bernoulli += count_k * (
                                np.log(count_k)
                                - np.log(frequency_table.sum()))
                        total_score += likelihood_bernoulli
                    else:
                        # Binary variable with parents: likelihood using a
                        # logistic regression model.
                        X_par = df[parents_i]
                        y = df[i]
                        logistic = LogisticRegression(solver='lbfgs')
                        logistic.fit(X_par, y)
                        predict_prob = logistic.predict_proba(X_par)
                        # Negative cross-entropy loss a.k.a. log-likelihood.
                        likelihood_logistic = -log_loss(
                            y_true=y, y_pred=predict_prob, normalize=False)
                        total_score += likelihood_logistic
                else:
                    # Continuous variable: linear regression on the parents,
                    # then LiNGAM likelihood of the residual.
                    b_i = np.zeros(dag.number_of_nodes())
                    if parents_i:
                        X_par = df[parents_i]
                        y = df[i]
                        lr = LinearRegression(fit_intercept=True)
                        lr.fit(X_par, y)
                        for index, j in enumerate(parents_i):
                            b_i[j] = lr.coef_[index]
                        bi_0 = lr.intercept_
                    else:
                        bi_0 = np.mean(df[i])
                    likelihood_lingam = ut.likelihood_i(
                        lingam_data, i, b_i, bi_0)
                    total_score += likelihood_lingam
            return total_score

        def _h(W):
            """Evaluate value and gradient of the acyclicity constraint.

            Uses the polynomial formulation of (Yu et al. 2019), slightly
            faster than trace(expm(W*W)) (Zheng et al. 2018) at the cost
            of numerical stability.
            """
            M = np.eye(d) + W * W / d
            G_h = np.linalg.matrix_power(M, d - 1)
            h = (G_h.T * M).sum() - d
            return h, G_h

        def _adj(w):
            """Convert doubled variables ([2 d^2] array) back to original
            variables ([d, d] matrix)."""
            return (w[:d * d] - w[d * d:]).reshape([d, d])

        def _func(w):
            """Evaluate value and gradient of the augmented Lagrangian for
            doubled variables ([2 d^2] array)."""
            W = _adj(w)
            loss, G_loss = _loss(W)
            h, G_h = _h(W)
            # w >= 0 (enforced by the bounds), so w.sum() is the L1 norm.
            obj = (loss + 0.5 * rho * h * h + alpha * h
                   + self._lambda1 * w.sum())
            G_smooth = G_loss + (rho * h + alpha) * G_h.T * W * 2
            g_obj = np.concatenate(
                (G_smooth + self._lambda1, - G_smooth + self._lambda1),
                axis=None)
            return obj, g_obj

        n, d = X.shape
        # Double w_est into (w_pos, w_neg) so L-BFGS-B can handle the L1
        # penalty with simple nonnegativity bounds.
        w_est, rho, alpha, h = np.random.random(2 * d * d), 1.0, 0.0, np.inf
        # Diagonal entries (self-loops) are pinned to zero.
        bnds = [(0, 0) if i == j else (0, None)
                for _ in range(2) for i in range(d) for j in range(d)]
        # Augmented-Lagrangian dual ascent on the acyclicity constraint.
        for _ in range(self._max_iter):
            w_new, h_new = None, None
            while rho < self._rho_max:
                sol = sopt.minimize(_func, w_est, method='L-BFGS-B',
                                    jac=True, bounds=bnds)
                w_new = sol.x
                h_new, _ = _h(_adj(w_new))
                if h_new >= 0.25 * h:
                    rho *= 10
                else:
                    break
            w_est, h = w_new, h_new
            alpha += rho * h
            if h <= self._h_tol and h != 0:
                break
            if rho >= self._rho_max * 1e-6 and h > 1e+05:
                # Restart to avoid converging to the full graph.
                w_est = np.random.random(2 * d * d)
                rho = 1.0
            elif rho >= self._rho_max:
                break
        if np.sum(np.absolute(_adj(w_est))) < 0.09:
            # Avoid the all-zero matrix.
            w_est = np.random.random(2 * d * d)
        W_est = _adj(w_est)
        W_est[np.abs(W_est) < self._w_threshold] = 0

        # Second phase: refine the thresholded skeleton by rescoring
        # candidate graphs with the DAG-aware BIC likelihood.
        self._loss_type = 'mixed_dag'
        aa, _ = _loss(W_est)  # loss of the current best graph
        I = np.where(W_est != 0)
        W_min_lss = np.copy(W_est)
        # Check directions: try every subset of edge reversals.
        # 1: reverse the direction; 0: unchanged.
        candi_setting = list(product(range(2), repeat=len(I[0])))
        for candi_setting_i in range(1, len(candi_setting)):
            W_tmp = np.copy(W_est)
            for iii in range(len(I[0])):  # transform to W_tmp
                if candi_setting[candi_setting_i][iii] == 1:
                    W_tmp[I[0][iii], I[1][iii]] = 0
                    W_tmp[I[1][iii], I[0][iii]] = 1
            lss, lss_G = _loss(W_tmp)
            if lss < aa and _h(W_tmp)[0] < self._h_tol:
                W_min_lss = np.copy(W_tmp)
                aa = lss
        # Prune process: try deleting single edges.
        if d > 2 and len(I[0]) > (d - 1):  # > min_edge
            W0 = np.copy(W_min_lss)
            I_delete = np.where(W_min_lss != 0)
            for delete_i in range(len(I_delete[0])):
                W_tmp = np.copy(W0)
                W_tmp[I_delete[0][delete_i], I_delete[1][delete_i]] = 0
                lss, lss_G = _loss(W_tmp)
                if lss < aa and _h(W_tmp)[0] < self._h_tol:
                    W_min_lss = np.copy(W_tmp)
                    aa = lss
            W0 = np.copy(W_est)
            for delete_i in range(len(I[0])):
                W_tmp = np.copy(W0)
                W_tmp[I[0][delete_i], I[1][delete_i]] = 0
                lss, lss_G = _loss(W_tmp)
                if lss < aa and _h(W_tmp)[0] < self._h_tol:
                    W_min_lss = np.copy(W_tmp)
                    aa = lss
        # Add process: try adding single edges where none exists.
        if d > 2 and len(I[0]) < (d * (d - 1) / 2):
            W0 = np.copy(W_min_lss)
            W_edges = W0 + W0.T + np.eye(d)
            I_add = np.where(W_edges == 0)  # add undirected edges' indices
            for add_i in range(len(I_add[0])):
                W_tmp = np.copy(W0)
                W_tmp[I_add[0][add_i], I_add[1][add_i]] = 1
                lss, lss_G = _loss(W_tmp)
                if lss < aa and _h(W_tmp)[0] < self._h_tol:
                    W_min_lss = np.copy(W_tmp)
                    aa = lss
            W0 = np.copy(W_est)
            W_edges = W0 + W0.T + np.eye(d)
            I_add = np.where(W_edges == 0)  # add undirected edges' indices
            for add_i in range(len(I_add[0])):
                W_tmp = np.copy(W0)
                W_tmp[I_add[0][add_i], I_add[1][add_i]] = 1
                lss, lss_G = _loss(W_tmp)
                if lss < aa and _h(W_tmp)[0] < self._h_tol:
                    W_min_lss = np.copy(W_tmp)
                    aa = lss
        return W_min_lss

    @property
    def adjacency_matrix_(self):
        """Estimated adjacency matrix between mixed variables.

        Returns
        -------
        adjacency_matrix_ : array-like, shape (n_features, n_features)
            The adjacency matrix of variables, where ``n_features``
            is the number of observed variables.
        """
        return self._adjacency_matrix