User Experience Design :: User Experience Design


서울시 공공데이터 확인

https://data.seoul.go.kr/


시각화 툴 matplotlib

https://datascienceschool.net/view-notebook/d0b1637803754bb083b5722c9f2209d0/


데이터 프레임 만들어보기 (python)

https://doorbw.tistory.com/172

https://www.youtube.com/watch?v=vcjoWVAwFBc


서울시 인구추이 데이터셋

seoul_population_eng01.csv

seoul_population_eng01.prn



허민석님의 Kaggle 타이타닉 과제 따라해보기

https://www.kaggle.com/c/titanic#evaluation

https://github.com/minsuk-heo/kaggle-titanic/blob/master/titanic-solution.ipynb


Sung Kim 교수님의 ML강좌

선형회귀 관련 -> https://www.youtube.com/watch?v=mQGwjrStQgg&index=5&list=PLlMkM4tgfjnLSOjrEJN31gZATbcj_MpUm


선형회귀 관련 colab.

https://colab.research.google.com/drive/1RPh7QM4oBoYUEUHFMPS2GxLoKZVIJ8L3

https://colab.research.google.com/drive/1_4QHwUnvaS5EcBvWkHbiLZvyHvePKIEu


선형회귀 ML코드

https://writeren.tistory.com/27



from google.colab import files

uploaded = files.upload()


import pandas as pd

test=pd.read_csv('seoul_population_eng.csv')

test.head()

----------------------------------------


# Number of rows in the loaded table; kept because other cells may read it.
index_num = len(test.index)

# x axis: the first column of every row (the original code calls it `year`).
x_data = list(test.iloc[:, 0])
print(x_data)

# y axis: the third column is stored as strings with "," separators (the
# original calls str.replace on each cell), so strip the separators and
# convert to int in a single pass.
y_data = [int(population.replace(",", "")) for population in test.iloc[:, 2]]
print(y_data)


----------------------------


import matplotlib.pyplot as plt

# Alternative marker style kept from the original notebook:
# plt.plot(x_data, y_data,'go')

# Red squares joined by a dashed line.
plt.plot(x_data, y_data, 'rs--')

# Axis captions and legend entry.
plt.xlabel('x')
plt.ylabel('y')
plt.legend(['ml code sample'])

# Fix the visible window on both axes.
plt.xlim([1985, 2018])
plt.ylim([9600000, 11000000])

plt.show()


# TF1-style graph code: `tf.placeholder` and `tf.random_uniform` are not
# available on the TensorFlow 2.x top-level module, so import the v1
# compatibility API and switch off v2 behaviour before building the graph.
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Toy training set: seven (x, y) pairs.
xData = [1, 2, 3, 4, 5, 6, 7]
yData = [100, 2500, 3000, 4000, 5000, 6000, 7000]

# Trainable slope and intercept, each a 1-element tensor drawn uniformly
# from the range -100..100.
W = tf.Variable(tf.random_uniform([1], -100, 100))
b = tf.Variable(tf.random_uniform([1], -100, 100))

# Graph inputs, supplied when the graph is run.
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

# Linear hypothesis and its mean-squared-error cost.
H = W * X + b
cost = tf.reduce_mean(tf.square(H - Y))




# This cell repeats the linear-regression graph definition. It uses the
# TF1 graph API (`tf.placeholder`, `tf.random_uniform`), which the
# TensorFlow 2.x top-level module does not provide, so the v1 compatibility
# API is imported and v2 behaviour is disabled first.
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# Toy training set: seven (x, y) pairs.
xData = [1, 2, 3, 4, 5, 6, 7]
yData = [100, 2500, 3000, 4000, 5000, 6000, 7000]

# Trainable slope and intercept, each a 1-element tensor drawn uniformly
# from the range -100..100.
W = tf.Variable(tf.random_uniform([1], -100, 100))
b = tf.Variable(tf.random_uniform([1], -100, 100))

# Graph inputs, supplied when the graph is run.
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

# Linear hypothesis and its mean-squared-error cost.
H = W * X + b
cost = tf.reduce_mean(tf.square(H - Y))





Upload

from google.colab import files
files.upload()

------------------------------------------------------------------------------

Download


files.download('filename')

List directory


# List the working directory through the standard library instead of relying
# on `os` happening to be reachable as an attribute of google.colab.files.
import os

os.listdir()

"""


from google.colab import files

uploaded = files.upload()


# Report the name and byte length of every uploaded file. The loop body must
# be indented; without indentation the cell fails with IndentationError.
for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))


from google.colab import files

uploaded = files.upload()


import pandas as pd

test=pd.read_csv('test.csv')

test.head()


import pandas as pd

train=pd.read_csv('train.csv')

train.head()


train.shape


test.shape


train.info()


train.isnull().sum()


test.isnull().sum()


import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns

sns.set() # make seaborn's settings the default for plots


def bar_chart(feature):
    """Plot and print survivor / non-survivor counts for one ``train`` column.

    Reads the module-level ``train`` frame, counts the values of ``feature``
    separately for rows with ``Survived == 1`` and ``Survived == 0``, draws
    the two rows as a stacked bar chart and prints the count table.

    Args:
        feature: Name of the ``train`` column whose values are counted.
    """
    survived = train[train['Survived'] == 1][feature].value_counts()
    dead = train[train['Survived'] == 0][feature].value_counts()
    # One row per outcome, one column per distinct value of `feature`.
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    df.plot(kind='bar', stacked=True, figsize=(10, 5))
    # `df` is local to this function, so the print has to live here; at
    # module level it would raise NameError.
    print(df)


bar_chart('Sex')


bar_chart('Pclass')


bar_chart('SibSp')


bar_chart('Parch')


"""데이터프레임 셀 변경하기.<br>

컬럼을 하나 없애기 위해서는 drop

<br>

셀 내 콘텐츠를 변경하기 위해서는 <br>

for dataset in train_test_data..

"""


Train 데이터 드랍 Name, PassengerId


# Remove the Name column from both frames, and additionally the PassengerId
# column from the training frame only (the test frame keeps PassengerId).
test.drop(columns='Name', inplace=True)
for unused_column in ('Name', 'PassengerId'):
    train.drop(columns=unused_column, inplace=True)


Train 데이터 드랍 Ticket, Cabin, Embarked


# Columns removed from the training frame in place.
featuresDrop = ['Ticket', 'Cabin', 'Embarked']
train.drop(columns=featuresDrop, inplace=True)


train.head()


# Keep both frames in one list so the same transformation can be applied to
# each; the list holds references, the frames are not concatenated.
train_test_data = [train, test]

# Replace the Sex labels with integer codes in every frame.
sex_mapping = dict(male=0, female=1)
for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)


train.head(10)


Test 데이터 드랍 Ticket, cabin, Embarked


# The same three columns, removed from the test frame in place.
featuresDrop = ['Ticket', 'Cabin', 'Embarked']
test.drop(columns=featuresDrop, inplace=True)


test.head()


일단 테스트를 위해서 Fare 결측값을 임시 값(10)으로 채움. ㅠ


# Replace missing Fare values with the placeholder constant 10 in each frame.
for dataset in train_test_data:
    dataset['Fare'] = dataset['Fare'].fillna(value=10)


test.isnull().sum()


# Mean Age of each frame. The earlier groupby result and the un-called
# `test['Age'].mean` were both overwritten before use, so only the two real
# computations are kept.
trainAg = train['Age'].mean()
testAg = test['Age'].mean()
print(trainAg)

# Unweighted midpoint of the two means; a later cell uses it to fill
# missing ages.
averageAge = (trainAg + testAg) / 2
print(averageAge)


fill missing age with the average age (midpoint of the train and test mean ages)

dataframe 내 빈값 채우기


# Fill missing Age values with the shared average and print each frame for
# inspection.
for dataset in train_test_data:
    dataset['Age'] = dataset['Age'].fillna(value=averageAge)
    print(dataset)


print(train_test_data)


# Labels first, then the feature frame without the label column.
target = train['Survived']
train_data = train.drop(columns='Survived')

train_data.shape, target.shape


train_data.head()


test.head()


# `isnull` is a method: it has to be called before the per-column sum,
# otherwise the attribute lookup on the bound method raises AttributeError.
train_data.isnull().sum()

train.isnull().sum()


test.isnull().sum()


"""# Modeling


SVM으로 모델링해보자.

from sklearn.svm import SVC

"""


Importing Classifier Modules

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC


import numpy as np


train.info()


"""# Cross Validation (K-fold)

split을 10으로 나누어 k-fold 실행해보자

"""


from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

k_fold = KFold(n_splits=10, shuffle=True, random_state=0)


"""## SVM"""


# Support-vector classifier with default settings, scored by accuracy on
# each split produced by k_fold.
scoring = 'accuracy'
clf = SVC()
score = cross_val_score(
    estimator=clf, X=train_data, y=target,
    scoring=scoring, cv=k_fold, n_jobs=1,
)
print(score)


"""##kNN"""


# k-nearest-neighbours classifier using 13 neighbours, scored by accuracy on
# each split produced by k_fold.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(
    estimator=clf, X=train_data, y=target,
    scoring=scoring, cv=k_fold, n_jobs=1,
)
print(score)


"""##DecisionTree"""


# Decision tree with default settings, scored by accuracy on each split
# produced by k_fold.
scoring = 'accuracy'
clf = DecisionTreeClassifier()
score = cross_val_score(
    estimator=clf, X=train_data, y=target,
    scoring=scoring, cv=k_fold, n_jobs=1,
)
print(score)


"""#TESTING!!"""


# Fit a fresh decision tree on the full training set.
clf = DecisionTreeClassifier()
clf.fit(train_data, target)

# The training frame had PassengerId dropped, so the column is removed
# from a copy of the test frame before predicting.
test_data = test.drop(columns="PassengerId").copy()
prediction = clf.predict(test_data)

# Pair every test PassengerId with its predicted label.
submission = pd.DataFrame(
    {"PassengerId": test["PassengerId"], "Survived": prediction}
)


submission.to_csv('submission.csv', index=False)

files.download('submission.csv')


# Read the written file back into `submission` and preview it. The original
# bound the result to a misspelled name, so the preview showed the in-memory
# frame rather than what was actually saved.
submission = pd.read_csv('submission.csv')
submission.head(500)


# An earlier cell already removes Name from both frames; errors='ignore'
# keeps this cell re-runnable instead of raising KeyError on the second drop.
test.drop('Name', axis=1, inplace=True, errors='ignore')
train.drop('Name', axis=1, inplace=True, errors='ignore')

test.head()
train.head(5)


bar_chart('Sex')


티켓, SibSp, Parch Cabin, Embarked drop

# drop(..., inplace=True) returns None, so its result must not be assigned
# back to `train` / `test` (that would replace the frames with None).
# The empty-string label is removed because it names no column, and
# errors='ignore' tolerates Cabin having been dropped by an earlier cell.
featuresDrop = ['Cabin']
train.drop(featuresDrop, axis=1, inplace=True, errors='ignore')
test.drop(featuresDrop, axis=1, inplace=True, errors='ignore')


print(train_test_data)


train.isnull().sum()

test.isnull().sum()


fill missing Embarked info.


for dataset in train_test_data:


댓글을 달아 주세요

Technorati Profile