Parametric EM (missing data)
In [1]:
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import os
# The generated databases are saved as "out/*.csv".
EMnomissing="out/EM_nomissing.csv"
EMmissing="out/EM_missing.csv"
# Make sure the output folder exists before anything is written into it, so the
# notebook survives Restart Kernel -> Run All on a fresh checkout.
# NOTE(review): assumes the CSV writers used below do not create missing
# parent directories themselves -- confirm.
os.makedirs(os.path.dirname(EMnomissing), exist_ok=True)
Generating data with missing values (at random)
In [2]:
# Fix the aGrUM random seed (a seed of 0 would mean "initialise from the clock")
# so that the randomly drawn CPTs of the source network are the same on every run.
# NOTE(review): assumes generateSample draws from the same seeded generator -- confirm.
gum.initRandom(42)

# Source network: its structure is given by the fastBN string.
src=gum.fastBN("A->B<-C->D->E<-B;D->F")

# Draw 5000 complete records from the source network into the "no missing" CSV.
gum.generateSample(src,5000,EMnomissing,random_order=False)

# Last expression of the cell: rich display of the source network.
src
Out[2]:
In [3]:
import pandas as pd
import numpy as np
def add_missing(src,dst,proba,seed=None):
    """Copy the CSV file `src` to `dst`, hiding each cell with probability `proba`.

    Every cell of the table is masked independently of the others; masked
    cells are written as "?" in the destination file.

    Parameters
    ----------
    src : path of the complete CSV file to read.
    dst : path of the CSV file to write.
    proba : probability, in [0, 1], that a given cell is replaced by "?".
    seed : optional seed for the NumPy generator; pass an int to obtain the
        same mask on every call, leave it to None for a fresh random mask.

    Raises
    ------
    ValueError : if `proba` is outside [0, 1].
    """
    if not 0.0 <= proba <= 1.0:
        raise ValueError(f"proba must be in [0, 1], got {proba}")

    df=pd.read_csv(src)

    # A local Generator keeps the draw independent of (and harmless to) the
    # global NumPy random state.
    rng=np.random.default_rng(seed)
    mask=rng.choice([True, False], size=df.shape,p=[proba,1-proba])

    # Masked cells become NaN, which turns integer columns into float columns;
    # '%.0f' writes those floats back without a decimal part ("1", not "1.0").
    df.mask(mask).to_csv(dst,na_rep='?',index=False,float_format='%.0f')
# The complete database was already sampled from the source network in the
# previous cell; the identical generateSample call that used to be repeated
# here only re-drew the same 5000 rows, so it is dropped.
# Hide 10% of the cells (independently, at random) to build the incomplete database.
add_missing(EMnomissing,EMmissing,proba=0.1)
In [4]:
def _print_head(path, title, nb_lines=10):
    """Print `title`, then the first `nb_lines` lines of the text file `path`."""
    print(title)
    with open(path, "r") as handle:
        # zip() stops after nb_lines lines or at end of file, whichever comes first.
        for _, line in zip(range(nb_lines), handle):
            print(line, end="")


# Show the header and the first records of both databases for comparison.
_print_head(EMnomissing, "No missing")
_print_head(EMmissing, "Missing")
No missing A,B,C,D,E,F 0,1,1,1,0,1 1,1,0,1,0,1 1,1,0,0,0,1 0,1,0,0,0,1 0,1,0,1,0,0 0,1,0,0,1,1 1,1,0,0,0,1 1,1,0,0,0,1 0,1,0,0,0,1 Missing A,B,C,D,E,F 0,1,?,1,?,1 1,1,0,1,0,1 1,1,0,0,0,1 0,1,0,0,0,1 0,1,0,1,0,0 0,1,0,0,1,? 1,1,0,0,0,1 ?,1,0,0,?,1 0,1,0,0,0,1
Learning with missing data
In [5]:
# Build a learner on the incomplete database; "?" is the symbol that marks a
# missing cell in the CSV file.
# NOTE(review): passing `src` presumably makes the learner reuse the variables
# of the source network -- confirm against the BNLearner documentation.
learner = gum.BNLearner(EMmissing, src, ["?"])
print("Missing values in {} : {}".format(EMmissing, learner.hasMissingValues()))
Missing values in out/EM_missing.csv : True
In [6]:
# Demonstration: without EM, parameter learning on a database that contains
# missing values raises MissingValueInDatabase; catch it and report it.
try:
    learner.learnParameters(src.dag())
except gum.MissingValueInDatabase:
    print(
        "Learning is not possible without EM "
        "if there are some missing values."
    )
Learning is not possible without EM if there are some missing values.
In [7]:
# Enable EM with an epsilon of 1e-3, add a smoothing prior, and print the
# resulting learner configuration.
learner.useEM(1e-3)
learner.useSmoothingPrior()
print(learner)

# The structure is taken from the source network; only the parameters are learned.
bn = learner.learnParameters(src.dag())

# Side-by-side inference views: source network vs. EM estimate.
gnb.flow.row(
    gnb.getInference(src),
    gnb.getInference(bn),
    captions=[
        "Source",
        f"Estimation EM in {learner.nbrIterations()} iteration(s)",
    ],
)
Filename : out/EM_missing.csv Size : (5000,6) Variables : A[2], B[2], C[2], D[2], E[2], F[2] Induced types : False Missing values : True Algorithm : MIIC Score : BDeu (Not used for constraint-based algorithms) Correction : MDL (Not used for score-based algorithms) Prior : Smoothing Prior weight : 1.000000 EM : True EM epsilon : 0.001000
Learning with a smaller EM epsilon (and no smoothing prior)
In [8]:
# Start again from a fresh learner (so no smoothing prior is set) and run EM
# with a much smaller epsilon of 1e-8.
learner = gum.BNLearner(EMmissing, src, ["?"])
# NOTE(review): verbosity is presumably what makes the learner record the
# per-iteration history plotted in the next cell -- confirm.
learner.setVerbosity(True)
learner.useEM(1e-8)

# The structure is taken from the source network; only the parameters are learned.
bn2 = learner.learnParameters(src.dag())

# Side-by-side inference views: source network vs. EM estimate.
gnb.flow.row(
    gnb.getInference(src),
    gnb.getInference(bn2),
    captions=[
        "Source",
        f"Estimation EM in {learner.nbrIterations()} iteration(s)",
    ],
)
In [9]:
import matplotlib.pyplot as plt
import numpy as np
# Plot the values recorded by the learner at each iteration of the last EM run.
# An explicit Figure/Axes pair is used instead of the implicit pyplot state,
# and both axes are labelled so the figure can be read on its own.
nb_iter = learner.nbrIterations()
fig, ax = plt.subplots()
ax.plot(np.arange(1, 1 + nb_iter), learner.history())
# One tick every two iterations keeps the x axis readable.
ax.set_xticks(np.arange(1, 1 + nb_iter, step=2))
ax.set_xlabel("EM iteration")
ax.set_ylabel("Error")
ax.set_title("Error during EM iterations");
In [ ]: