서울시 공공데이터 확인
시각화 툴 matplotlib
https://datascienceschool.net/view-notebook/d0b1637803754bb083b5722c9f2209d0/
데이터 프레임 만들어보기 (python)
https://doorbw.tistory.com/172
https://www.youtube.com/watch?v=vcjoWVAwFBc
서울시 인구추이 데이터셋
허민석님의 Kaggle 타이타닉 과제 따라해보기
https://www.kaggle.com/c/titanic#evaluation
https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb
Sung Kim 교수님의 ML강좌
선형회귀 관련 -> https://www.youtube.com/watch?v=mQGwjrStQgg&index=5&list=PLlMkM4tgfjnLSOjrEJN31gZATbcj_MpUm
선형회귀 관련 colab.
https://colab.research.google.com/drive/1RPh7QM4oBoYUEUHFMPS2GxLoKZVIJ8L3
https://colab.research.google.com/drive/1_4QHwUnvaS5EcBvWkHbiLZvyHvePKIEu
선형회귀 ML코드
https://writeren.tistory.com/27
# Upload a file from the local machine into the Colab runtime.
from google.colab import files
uploaded = files.upload()
# Load the Seoul population CSV into a DataFrame.
# NOTE(review): the name `test` is reused further down for the Titanic test set.
import pandas as pd
test=pd.read_csv('seoul_population_eng.csv')
# Preview the first rows of the table.
test.head()
----------------------------------------
# Row count of the loaded table.
index_num = len(test.index)

# x-axis values: the first column (position 0), one entry per row.
x_data = test.iloc[:, 0].tolist()
print(x_data)

# y-axis values: the third column (position 2). Each cell is a string that
# contains thousands separators, so the commas are stripped before the
# text is converted to an integer.
y_data = [int(population.replace(",", "")) for population in test.iloc[:, 2]]
print(y_data)
----------------------------
import matplotlib.pyplot as plt

# Red squares joined by a dashed line ('rs--'); plain green dots ('go')
# are the alternative marker style for the same data.
plt.plot(x_data, y_data, 'rs--')

# Axis captions and legend entry.
plt.xlabel('x')
plt.ylabel('y')
plt.legend(['ml code sample'])

# Fix the visible window on both axes.
plt.xlim(1985, 2018)
plt.ylim(9600000, 11000000)

plt.show()
import tensorflow as tf

# NOTE: tf.random_uniform and tf.placeholder are TensorFlow 1.x graph-mode
# APIs; under TensorFlow 2.x they are only reachable through tf.compat.v1.

# Toy data set for a one-variable linear regression.
xData = [1,2,3,4,5,6,7]
yData = [100,2500,3000,4000,5000,6000,7000]

# Trainable slope and intercept, each drawn uniformly from [-100, 100).
W = tf.Variable(tf.random_uniform([1], -100, 100))
b = tf.Variable(tf.random_uniform([1], -100, 100))

# Graph inputs through which the samples and targets are fed.
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

# Linear hypothesis and its mean-squared-error cost.
H = W * X + b
cost = tf.reduce_mean(tf.square(H - Y))
Download
files.download('filename')
List directory
files.os.listdir()
"""
from google.colab import files

# Upload files into the Colab runtime; the result is keyed by file name and
# holds each file's content.
uploaded = files.upload()

# Report every received file together with its size.
for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))
# Upload the Titanic CSV files into the Colab runtime.
from google.colab import files
uploaded = files.upload()
# Load the test set and preview it.
import pandas as pd
test=pd.read_csv('test.csv')
test.head()
# Load the training set and preview it.
import pandas as pd
train=pd.read_csv('train.csv')
train.head()
# (rows, columns) of each table.
train.shape
test.shape
# Column dtypes and non-null counts of the training set.
train.info()
# Number of missing values per column in each table.
train.isnull().sum()
test.isnull().sum()
import matplotlib.pyplot as plt
# IPython magic (valid only inside a notebook): render figures inline.
%matplotlib inline
import seaborn as sns
sns.set() # apply seaborn's default styling to subsequent plots
def bar_chart(feature):
    """Plot and print survivor vs. non-survivor counts for one column.

    Reads the module-level ``train`` DataFrame. Rows are split on the
    ``Survived`` column (1 -> 'Survived', 0 -> 'Dead'); within each group
    the occurrences of every distinct value of ``feature`` are counted.
    The two count rows are drawn as a stacked bar chart and the underlying
    table is printed.

    Args:
        feature: Name of the ``train`` column to break down.

    Returns:
        None.
    """
    # Select the feature column for each outcome in a single .loc lookup.
    survived = train.loc[train['Survived'] == 1, feature].value_counts()
    dead = train.loc[train['Survived'] == 0, feature].value_counts()

    # One row per outcome, one column per distinct feature value.
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']

    df.plot(kind='bar', stacked=True, figsize=(10, 5))
    print(df)
# Draw the survival breakdown for each of these columns, in this order.
for column in ('Sex', 'Pclass', 'SibSp', 'Parch'):
    bar_chart(column)
"""데이터프레임 셀 변경하기.<br>
컬럼을 하나 없애기 위해서는 drop
<br>
셀 내 콘텐츠를 변경하기 위해서는 <br>
for dataset in train_test_data..
"""
Name 컬럼은 train/test 양쪽에서 드랍, PassengerId 컬럼은 train에서만 드랍
# The passenger name is removed from both tables (test first, then train).
for frame in (test, train):
    frame.drop('Name', axis=1, inplace=True)

# The passenger id is removed from the training table only.
train.drop('PassengerId', axis=1, inplace=True)
Train 데이터 드랍: Ticket, Cabin, Embarked (PassengerId는 위에서 이미 드랍)
# Columns that are not used as model inputs.
featuresDrop = ['Ticket', 'Cabin', 'Embarked']

# Remove them from the training table in place, then preview the result.
train.drop(columns=featuresDrop, inplace=True)
train.head()
# Both frames are collected in one list (they are NOT concatenated) so the
# same in-place transformation can be applied to each of them in turn.
train_test_data = [train, test]

# Encode the categorical 'Sex' column as integers.
sex_mapping = {"male": 0, "female": 1}
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)

train.head(10)
Test 데이터 드랍 Ticket, cabin, Embarked
# Same unused columns as for the training table.
featuresDrop = ['Ticket', 'Cabin', 'Embarked']

# Remove them from the test table in place, then preview the result.
test.drop(columns=featuresDrop, inplace=True)
test.head()
일단 테스트를 위해 Fare 결측값을 임시 값(10)으로 채운다.
# Placeholder imputation: every missing fare is replaced by the constant 10.
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(10)

# Count the missing values that remain in each column of the test table.
test.isnull().sum()
# Mean age of each table.
trainAg = train['Age'].mean()
testAg = test['Age'].mean()
print(trainAg)

# Midpoint of the two table means. This is not weighted by the number of
# rows in each table, so it is not the mean over all passengers.
averageAge = (trainAg + testAg) / 2
print(averageAge)
Fill missing ages. (The reference solution fills with the median age for each title — Mr, Mrs, Miss, Others; here a single average age is used for every passenger instead.)
dataframe 내 빈값 채우기
# Replace every missing age, in both tables, with the shared average age.
for dataset in train_test_data:
    dataset['Age'] = dataset['Age'].fillna(averageAge)
    # NOTE(review): assumed to be a per-table debug print inside the loop.
    print(dataset)

print(train_test_data)
# Separate the model inputs from the label column.
train_data = train.drop('Survived', axis=1)
target = train['Survived']
train_data.shape, target.shape

train_data.head()
test.head()

# Count the missing values left in each table. isnull is a method and has to
# be called before .sum() can be applied to its result.
train_data.isnull().sum()
train.isnull().sum()
test.isnull().sum()
"""# Modeling
SVM으로 모델링해보자.
from sklearn.svm import SVC
"""
Importing Classifier Modules
# Candidate classifiers. Only KNeighborsClassifier, DecisionTreeClassifier
# and SVC are instantiated in the cells below.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import numpy as np
# Column dtypes and non-null counts of the training table before modelling.
train.info()
"""# Cross Validation (K-fold)
split을 10으로 나누어 k-fold 실행해보자
"""
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10 shuffled folds; the fixed random_state makes the split repeatable.
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)
"""## SVM"""
# Support-vector classifier with default settings, scored by accuracy on
# each fold of k_fold.
scoring = 'accuracy'
clf = SVC()
score = cross_val_score(
    clf,
    train_data,
    target,
    cv=k_fold,
    n_jobs=1,
    scoring=scoring,
)
print(score)
"""##kNN"""
# k-nearest-neighbours classifier with 13 neighbours, scored by accuracy on
# each fold of k_fold.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(
    clf,
    train_data,
    target,
    cv=k_fold,
    n_jobs=1,
    scoring=scoring,
)
print(score)
"""##DecisionTree"""
# Decision tree with default settings, scored by accuracy on each fold of
# k_fold.
scoring = 'accuracy'
clf = DecisionTreeClassifier()
score = cross_val_score(
    clf,
    train_data,
    target,
    cv=k_fold,
    n_jobs=1,
    scoring=scoring,
)
print(score)
"""#TESTING!!"""
# Fit the final model on the complete training set.
clf = DecisionTreeClassifier()
clf.fit(train_data, target)

# PassengerId is not a model input; it is removed from a separate copy so
# that `test` still carries it for the submission table.
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

# One row per test passenger: its id and the predicted label.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction
})
submission.to_csv('submission.csv', index=False)
files.download('submission.csv')

# Read the file back so the preview shows what was actually written.
submission = pd.read_csv('submission.csv')
submission.head(500)
# 'Name' is removed in place by an earlier cell, so a plain drop would raise
# KeyError here; errors='ignore' turns the call into a no-op when the column
# is already gone.
test.drop('Name', axis=1, inplace=True, errors='ignore')
train.drop('Name', axis=1, inplace=True, errors='ignore')
test.head()
train.head(5)
bar_chart('Sex')
Ticket, SibSp, Parch, Cabin, Embarked 드랍
# Columns to remove. An empty string is not a column label, so only real
# column names are listed.
featuresDrop = ['Cabin']

# Mutate in place and do NOT rebind the names: drop(..., inplace=True)
# returns None, and train_test_data must keep referring to these same
# objects. errors='ignore' skips labels that an earlier cell already removed.
train.drop(featuresDrop, axis=1, inplace=True, errors='ignore')
test.drop(featuresDrop, axis=1, inplace=True, errors='ignore')

print(train_test_data)
train.isnull().sum()
test.isnull().sum()
fill missing Embarked info.
for dataset in train_test_data: