728x90
#변수 선택법
def processSubset(x,y, feature_set):
    """Fit an OLS regression on a subset of columns and report its AIC.

    Args:
        x: Table of regressors; indexed with ``x[list(feature_set)]``.
        y: Response passed to ``sm.OLS`` as the endogenous variable.
        feature_set: Iterable of column labels of ``x`` to use as regressors.

    Returns:
        dict with the fitted results under ``"model"`` and that fit's
        ``aic`` attribute under ``"AIC"``.
    """
    selected_columns = list(feature_set)
    fitted = sm.OLS(y, x[selected_columns]).fit()
    return {"model": fitted, "AIC": fitted.aic}
# Smoke test: fit one model on the first five entries of featureColums and
# print the resulting {"model", "AIC"} dict. trainX, trainY and featureColums
# are defined outside this section.
print(processSubset(x=trainX,y=trainY,feature_set=featureColums[0:5]))
#모든 조합을 다 조합해서 좋은 모델을 반환시키는 알고리즘
import time
import itertools
def getBest(x,y,k):
    """Fit every model with exactly ``k`` predictors and return the lowest-AIC one.

    Every combination of ``k`` columns of ``x`` (excluding ``'const'``) is
    fitted together with the ``'const'`` column via ``processSubset``.

    Args:
        x: Table of regressors; must expose ``.columns`` and contain ``'const'``.
        y: Response variable forwarded to ``processSubset``.
        k: Number of non-constant predictors in each candidate model.

    Returns:
        The row (with ``'model'`` and ``'AIC'`` entries) of the candidate
        table whose AIC is smallest.
    """
    started = time.time()
    candidates = x.columns.difference(['const'])
    # One fit per k-sized combination; the intercept column is always included.
    fits = [
        processSubset(x, y, feature_set=list(subset) + ['const'])
        for subset in itertools.combinations(candidates, k)
    ]
    models = pd.DataFrame(fits)
    winner_position = models['AIC'].argmin()
    bestModel = models.loc[winner_position]
    finished = time.time()
    print("Processed", models.shape[0], "models on", k, "predictors in",
          (finished - started), "seconds.")
    return bestModel
#print(getBest(x=trainX,y=trainY,k=2))
# Repeat the exhaustive search for k = 1, 2, 3 predictors and store each
# winner in row k of `models`; `tic` marks the start time of the whole run.
models = pd.DataFrame(columns=["AIC","model"])
tic = time.time()
for i in range(1,4):
    models.loc[i] = getBest(trainX,trainY,i)
#toc = time.time()
#print("Total elapsed time : ",(toc-tic),"seconds")
#print(models)
#전진 선택법(Step=1)
def forward(x,y,predictors):
    """Run one forward-selection step.

    Each column of ``x`` that is not yet in ``predictors`` (and is not
    ``'const'``) is added in turn; the model is fitted with ``processSubset``
    and the candidate with the lowest AIC wins.

    Args:
        x: Table of regressors; must expose ``.columns`` and contain ``'const'``.
        y: Response variable forwarded to ``processSubset``.
        predictors: Labels already selected (without ``'const'``).

    Returns:
        The row (with ``'model'`` and ``'AIC'`` entries) of the best candidate.
    """
    remainingPredictors = [p for p in x.columns.difference(['const'])
                           if p not in predictors]
    tic = time.time()
    # list(...) keeps the concatenation correct even if a pandas Index is passed.
    chosen = list(predictors)
    results = [processSubset(x=x, y=y, feature_set=chosen + [p, 'const'])
               for p in remainingPredictors]
    # Convert to a DataFrame so the AIC column can be searched.
    models = pd.DataFrame(results)
    # argmin() is positional, so pair it with the positional indexer.
    bestModel = models.iloc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1,
          "predictors in", (toc-tic))
    # Report the AIC by label: position 0 of the row is the fitted model, not
    # the AIC, because the result dicts are ordered ("model", "AIC").
    print("Selected predictors:", bestModel['model'].model.exog_names,
          'AIC : ', bestModel['AIC'])
    return bestModel
#전진선택법 모델
def forward_model(x,y):
    """Build a model by repeated forward-selection steps.

    Starting with no predictors, ``forward`` is called once per step. From
    the second step on, the search stops as soon as the new candidate's AIC
    exceeds the AIC of the previously accepted model.

    Args:
        x: Table of regressors; must expose ``.columns`` and contain ``'const'``.
        y: Response variable forwarded to ``forward``.

    Returns:
        The fitted model stored in the last accepted step.
    """
    history = pd.DataFrame(columns=["AIC", "model"])
    started = time.time()
    predictors = []
    n_candidates = len(x.columns.difference(['const']))
    for step in range(1, n_candidates + 1):
        candidate = forward(x, y, predictors)
        # The first step is always accepted; later ones must not worsen the AIC.
        if step > 1 and candidate['AIC'] > previous_aic:
            break
        history.loc[step] = candidate
        accepted = history.loc[step]
        previous_aic = accepted["AIC"]
        # Carry the accepted regressors forward, without the intercept column.
        predictors = [name for name in accepted["model"].model.exog_names
                      if name != 'const']
    finished = time.time()
    print("Total elapesed time : ", (finished - started), "seconds.")
    model_column = history['model']
    return (model_column[len(model_column)])
# Run forward selection on the training data (trainX/trainY are defined
# outside this section) and print the summary of the returned model.
forwordBestModel=forward_model(trainX,trainY)
print(forwordBestModel.summary())
#후진제거법
def backward(x,y,predictors):
    """Run one backward-elimination step.

    Every subset of ``predictors`` that drops exactly one label is fitted
    (always together with ``'const'``) via ``processSubset``; the candidate
    with the lowest AIC wins.

    Args:
        x: Table of regressors; must contain the ``'const'`` column.
        y: Response variable forwarded to ``processSubset``.
        predictors: Labels currently in the model (without ``'const'``).

    Returns:
        The row (with ``'model'`` and ``'AIC'`` entries) of the best candidate.
    """
    tic = time.time()
    n_kept = len(predictors) - 1
    # Try every combination that leaves out exactly one current predictor.
    results = [processSubset(x, y, list(combo) + ['const'])
               for combo in itertools.combinations(predictors, n_kept)]
    models = pd.DataFrame(results)
    # argmin() is positional, so pair it with the positional indexer.
    bestModel = models.iloc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed", models.shape[0], "models on", n_kept,
          "predictors in", (toc - tic))
    # Report the AIC by label: position 0 of the row is the fitted model, not
    # the AIC, because the result dicts are ordered ("model", "AIC").
    print("Selected predictors :", bestModel['model'].model.exog_names,
          ' AIC:', bestModel['AIC'])
    return bestModel
def backword_model(x,y):
    """Build a model by repeated backward-elimination steps.

    Starts from the model with every non-constant column of ``x`` plus
    ``'const'`` and keeps dropping one predictor per step (via ``backward``)
    until a step would raise the AIC or only one predictor is left.

    Args:
        x: Table of regressors; must expose ``.columns`` and contain ``'const'``.
        y: Response variable forwarded to ``processSubset`` / ``backward``.

    Returns:
        The fitted model of the last accepted step, or the full model when
        no elimination was accepted.
    """
    tic = time.time()
    predictors = list(x.columns.difference(['const']))
    # Baseline is the full model *with* the intercept, so its AIC is
    # comparable to backward()'s candidates, which always include 'const'.
    fullResult = processSubset(x, y, predictors + ['const'])
    bestModel = fullResult['model']
    BmodelBefore = fullResult['AIC']
    while len(predictors) > 1:
        # Use the function's own arguments rather than module-level data.
        backwardResult = backward(x, y, predictors)
        if backwardResult['AIC'] > BmodelBefore:
            break
        bestModel = backwardResult['model']
        BmodelBefore = backwardResult['AIC']
        predictors = [k for k in bestModel.model.exog_names if k != 'const']
    toc = time.time()
    print("Total elapsed time :", (toc - tic), "seconds.")
    # Return the most recently accepted model, not the first one stored.
    return bestModel
# Run backward elimination on the training data (trainX/trainY are defined
# outside this section) and keep the returned model.
backwardBestModel = backword_model(trainX,trainY)
def Stepwise_model(x,y):
    """Build a model by stepwise (forward + backward) selection.

    Each iteration adds one predictor with ``forward`` and then tries a
    ``backward`` step on the enlarged set; the backward result replaces the
    forward one when its AIC is strictly lower. The search stops as soon as
    the step's AIC exceeds the AIC of the last accepted model.

    Args:
        x: Table of regressors; must expose ``.columns`` and contain ``'const'``.
        y: Response variable forwarded to the helper functions.

    Returns:
        The fitted model of the last accepted step, or the intercept-only
        model when even the first step raises the AIC.
    """
    tic = time.time()
    predictors = []
    # Baseline: intercept-only model.
    baseline = processSubset(x, y, predictors + ['const'])
    bestModel = baseline['model']
    SmodelBefore = baseline['AIC']
    for i in range(1, len(x.columns.difference(['const'])) + 1):
        stepResult = forward(x, y, predictors)
        print("forward")
        stepPredictors = [k for k in stepResult['model'].model.exog_names
                          if k != 'const']
        backwordResult = backward(x, y, stepPredictors)
        if backwordResult['AIC'] < stepResult['AIC']:
            stepResult = backwordResult
            stepPredictors = [k for k in stepResult['model'].model.exog_names
                              if k != 'const']
            print('backward')
        if stepResult['AIC'] > SmodelBefore:
            # Rejected step: keep the previously accepted model.
            break
        # Accept the step and move the baseline, so the next iteration is
        # compared against this model rather than the intercept-only fit.
        bestModel = stepResult['model']
        SmodelBefore = stepResult['AIC']
        predictors = stepPredictors
    toc = time.time()
    print("Total elapsed time : ", (toc - tic), "seconds")
    return bestModel
# Run stepwise selection on the training data (trainX/trainY are defined
# outside this section) and keep the returned model.
stepwiseBestModel = Stepwise_model(trainX,trainY)
cs |
반응형
'Machine learning' 카테고리의 다른 글
[기계학습]PCA (Principal Components Analysis) 주성분 분석 (0) | 2020.07.02 |
---|---|
[기계학습]회귀계수 축소법 ( Ridge regression, Ridge 회귀) (0) | 2020.06.25 |
[기계학습]. 다중선형회귀(Multiple Linear Regression)실습 Python code -예제 (0) | 2020.06.17 |
[기계학습]로지스틱 회귀분석 ( logit , odds , sigmoid 함수 ) (0) | 2020.06.16 |
[기계학습]다항 회귀 분석 ( 비선형 회귀 분석 ) (0) | 2020.06.16 |