import pandas as pd
import numpy as np
# Load the training features, training labels and test features.
# All three CSV files are read with the CP949 encoding.
x_train = pd.read_csv('data/X_train.csv', encoding='cp949')
y_train = pd.read_csv('data/y_train.csv', encoding='cp949')
x_test = pd.read_csv('data/X_test.csv', encoding='cp949')

# Keep the test customer ids for the submission file.
# 'cust_id' is expected to be the first column of every file.
x_test_id = x_test.loc[:, 'cust_id']

# Drop the leading cust_id column; .copy() makes each frame independent so
# the column assignments below modify it directly rather than a slice view.
x_train = x_train.iloc[:, 1:].copy()
y_train = y_train.iloc[:, 1]  # keep only the label column (gender)
x_test = x_test.iloc[:, 1:].copy()

# Missing refund amounts are replaced with 0. Assigning the result back
# avoids chained in-place fillna, which is not guaranteed to reach the frame.
x_train['환불금액'] = x_train['환불금액'].fillna(0)
x_test['환불금액'] = x_test['환불금액'].fillna(0)

print(x_train.describe())
# Replace negative total purchase amounts with 0. Only that column is
# touched; a bare boolean-mask assignment would overwrite every column of
# the matching rows (including the categorical ones) with 0.
# NOTE(review): x_test is not clipped the same way - confirm this is intended.
x_train.loc[x_train['총구매액'] < 0, '총구매액'] = 0
print(x_train.describe())
import sklearn.preprocessing

# Numeric columns that are divided by their largest absolute value.
scaled_cols = ['총구매액', '최대구매액', '환불금액', '내점일수']

# Fit the scaler on the training data only and reuse it for the test data,
# so that the same raw value maps to the same scaled value in both frames.
# Scaling each frame with its own maximum would make the features
# inconsistent between training and prediction.
max_abs_scaler = sklearn.preprocessing.MaxAbsScaler()
scaled_train = max_abs_scaler.fit_transform(x_train[scaled_cols])
scaled_test = max_abs_scaler.transform(x_test[scaled_cols])

# Write the scaled values back column by column so each column is replaced
# outright (and may change dtype from int to float).
for col_idx, col in enumerate(scaled_cols):
    x_train[col] = scaled_train[:, col_idx]
    x_test[col] = scaled_test[:, col_idx]
# One-hot encode the categorical columns of both frames.
x_train_enc = pd.get_dummies(x_train)
x_test_enc = pd.get_dummies(x_test)

# Columns present only in train (missing from test) and only in test.
lack_cols = set(x_train_enc.columns) - set(x_test_enc.columns)
remain_cols = set(x_test_enc.columns) - set(x_train_enc.columns)

# Align the test frame to the training columns in a single step:
# columns missing from test are created and filled with 0, columns unknown
# to train are dropped, and the column order matches the training frame.
# (Calling DataFrame.drop without assigning its result removes nothing.)
x_test_enc = x_test_enc.reindex(columns=x_train_enc.columns, fill_value=0)
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded training rows for a local evaluation;
# the fixed random_state makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x_train_enc,
    y_train,
    test_size=0.2,
    random_state=1,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def _scaled(step_name, estimator):
    """Return a pipeline that standardizes the features before *estimator*."""
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Candidate models as (label, pipeline) pairs. Every model except AdaBoost
# is preceded by a StandardScaler step.
# NOTE(review): 'lr' is a LinearRegression (a regressor) placed among
# classifiers - confirm it is meant to be compared on ROC AUC.
models = [
    ('clf', _scaled('clf', LogisticRegression(solver='liblinear'))),
    ('lr', _scaled('lr', LinearRegression())),
    ('lda', _scaled('lda', LinearDiscriminantAnalysis())),
    ('knn', _scaled('knn', KNeighborsClassifier())),
    ('gnb', _scaled('gnb', GaussianNB())),
    ('svm', _scaled('svm', SVC(gamma='auto'))),
    ('ada', Pipeline([('ada', AdaBoostClassifier())])),
]
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Stratified 5-fold cross-validation (one repeat) with a fixed seed, so that
# every candidate model is scored on the same folds.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# Report the mean and standard deviation of the ROC AUC for each candidate.
for name, model in models:
    cv_results = cross_val_score(model, train_x, train_y, cv=cv, scoring='roc_auc')
    # Label and separate the two numbers; concatenating them directly
    # produced a single unreadable run of digits.
    print(f"{name}: mean={cv_results.mean()} std={cv_results.std()}")
from sklearn.metrics import roc_auc_score

# Rebuild the selected model (standardization + liblinear logistic
# regression), fit it on the training split and score it on the held-out split.
best_model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(solver='liblinear')),
    ]
)
best_model.fit(train_x, train_y)

# predict_proba returns one column per class; the second column is used as
# the score for ROC AUC.
best_predict = best_model.predict_proba(test_x)
best_score = roc_auc_score(test_y, best_predict[:, 1])
print(best_score)
# Refit the selected model on all labelled rows, then predict class
# probabilities for the encoded test set.
best_model.fit(x_train_enc, y_train)
submit_predict = best_model.predict_proba(x_test_enc)
print(submit_predict)

# Build the submission table: customer id plus the probability from the
# second predict_proba column (presumably gender == 1; verify the label coding).
final = pd.DataFrame(
    {
        'cust_id': x_test_id,
        'gender': submit_predict[:, 1],
    }
)
final.to_csv('1234.csv', index=False)
# 참고: https://deepcell.kr/bbs/board.php?bo_table=bigbungi&wr_id=23
# 'Programmer's' 카테고리의 다른 글
# - visual studio code text/jsx 하이라이트 (0) | 2023.07.08
# - visual studio code(비쥬얼 스튜디오 코드) 영어로 복원 (0) | 2023.07.08
# - 빅데이터 분석기사 실기 단답형 (0) | 2021.06.18
# - 코인 피라미딩 코딩 (0) | 2021.04.29
# - 빅데이터 분석기사 요약 - 4과목. 빅데이터 결과 해석 (0) | 2021.04.16