블랙프라이데이 데이터 EDA

Kaggle/My kaggle

블랙프라이데이 데이터 EDA

토토모에요 2022. 3. 18. 18:50

728x90

사용언어는 R입니다. 참고바랍니다!

#라이브러리
library(dplyr)
library(ggplot2)


# Load the Black Friday training data and show its structure.
train <- read.csv(file = "train_3_18.csv")
str(train)

# Missing-value check.
# Gender: no missing values.
table(is.na(train[["Gender"]]))
table(train[["Gender"]])

# Share of rows per gender, drawn as a horizontal bar chart.
gender_frame <- as.data.frame(
  prop.table(
    table(train[["Gender"]])
  )
)
ggplot(data = gender_frame, mapping = aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  coord_flip() +
  labs(x = "성별", y = "비율", title = "데이터 성별 인원 비율")
# There are more records for men than for women.

# Age: no missing values.
table(is.na(train[["Age"]]))
table(train[["Age"]])

# Share of rows per age band, drawn as a horizontal bar chart.
Age_frame <- as.data.frame(
  prop.table(
    table(train[["Age"]])
  )
)
ggplot(data = Age_frame, mapping = aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  coord_flip() +
  labs(x = "나이대", y = "비율", title = "데이터 나이대 인원 비율")

# Row counts per age band within each gender.
count(group_by(train, Gender), Age)
# The 26-35 band has the most records.
# The 0-17 and 55+ bands have comparatively few transactions.


# Purchase: no missing values.
table(is.na(train[["Purchase"]]))
table(train[["Purchase"]])

# Product_ID: no missing values.
table(is.na(train[["Product_ID"]]))
table(train[["Product_ID"]])

# Occupation: no outliers or missing values.
table(is.na(train$Occupation))
table(train$Occupation)

# Share of rows per occupation code, drawn in polar coordinates.
Occupation_frame <- as.data.frame(prop.table(table(train$Occupation)))
# A plot can only have one coordinate system. The original added coord_flip()
# and then coord_polar(); the later one replaced the earlier one (with a
# message), so only coord_polar() is kept here.
ggplot(Occupation_frame, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  xlab("직업") +
  ylab("비율") +
  ggtitle("데이터 직업별 인원 비율") +
  coord_polar()

# Marital_Status: no missing values.
table(is.na(train[["Marital_Status"]]))
table(train[["Marital_Status"]])

# Share of rows per marital status, drawn as a horizontal bar chart.
Marital_Status_frame <- as.data.frame(
  prop.table(
    table(train[["Marital_Status"]])
  )
)
ggplot(
  data = Marital_Status_frame,
  mapping = aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  coord_flip() +
  labs(x = "결혼여부", y = "비율", title = "데이터 결혼여부 비율")
# The unmarried share is the larger of the two.

# Stay_In_Current_City_Years (years lived in the current city): no missing
# values.
table(is.na(train$Stay_In_Current_City_Years))
table(train$Stay_In_Current_City_Years)

# Share of rows per length of stay, drawn as a horizontal bar chart.
Stay_In_Current_City_Years_frame <- as.data.frame(
  prop.table(table(train$Stay_In_Current_City_Years))
)
# The category axis shows years of stay, not city names; the original label
# ("도시명") was copied from the city plot and has been corrected.
ggplot(
  Stay_In_Current_City_Years_frame,
  aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  xlab("체류 기간(년)") +
  ylab("비율") +
  coord_flip() +
  ggtitle("도시 체류 기간")
# Customers who have stayed about one year are the largest group.

# City_Category: missing-value check and level counts.
table(is.na(train[["City_Category"]]))
table(train[["City_Category"]])

# Share of rows per city category, drawn as a horizontal bar chart.
City_Category_frame <- as.data.frame(
  prop.table(
    table(train[["City_Category"]])
  )
)
ggplot(
  data = City_Category_frame,
  mapping = aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  coord_flip() +
  labs(x = "도시명", y = "비율", title = "데이터 도시별 인원 비율")
# Usage order by city category: B > A > C.

#종합하면 미혼 남성의 20대 후반~30대 초반이 블랙프라이데이를 많이 이용했다고 볼 수 있다.


# Target segment: unmarried men aged 26-35.
# Which products does the target segment buy most?
# Where does the target segment live, and which products should be shipped or
# sold quickly in which area?
target <- train %>%
  filter(Gender == "M", Age == "26-35", Marital_Status == "0") %>%
  select(Gender, Age, City_Category, Marital_Status, Product_ID, Purchase)

# Five most frequently purchased products within the target segment.
target_Product_ID <- target %>%
  count(Product_ID) %>%
  arrange(desc(n)) %>%
  head(n = 5) %>%
  as.data.frame()
target_Product_ID

ggplot(
  data = target_Product_ID,
  mapping = aes(x = Product_ID, y = n, fill = Product_ID)
) +
  geom_col() +
  coord_flip() +
  labs(x = "상품", y = "판매량", title = "상품별 판매 빈도")

# Top 5 products bought by unmarried men aged 26-35:
# P00265242 > P00110742 > P00025442 > P00057642 > P00184942.


# How often do unmarried men aged 26-35 shop on Black Friday?
# One row per user in the segment, with n = number of purchase records.
target_diversity <- train %>%
  filter(Gender == "M", Age == "26-35", Marital_Status == "0") %>%
  select(User_ID, Gender, Age, City_Category, Marital_Status, Product_ID) %>%
  group_by(User_ID) %>%
  count(User_ID)

# Average number of purchase records per user (about 112 in the recorded run).
mean(target_diversity[["n"]])


# Where does the target segment live, and which products should be shipped or
# sold quickly in which area?
target_city <- train %>%
  filter(Gender == "M", Age == "26-35", Marital_Status == "0") %>%
  select(User_ID, Gender, Age, City_Category, Marital_Status, Product_ID)

# Purchase-record counts per city category for the target segment.
target_city_frame <- as.data.frame(
  table(target_city[["City_Category"]])
)
ggplot(
  data = target_city_frame,
  mapping = aes(x = Var1, y = Freq, fill = Var1)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "지역",
    y = "빈도",
    title = "블랙프라이데이를 이용하는 미혼 26-35세 남성 지역별 빈도"
  )
# Most of the segment's records come from city category B.

#종합하면 B지역에 P00265242 , P00110742 , P00025442 , P00057642 , P00184942를 배치하거나 창고에 보관하면 효율적일것입니다.


# Next, look at the products and areas for the segments with low sales.
# Row counts per age band within each gender.
train_target_2 <- count(group_by(train, Gender), Age)

# Bar chart of the female rows only, one bar per age band.
ggplot(
  data = train_target_2[train_target_2[["Gender"]] == "F", ],
  mapping = aes(x = Age, y = n, fill = Age)
) +
  geom_col()
# Women in the 0-17 and 55+ bands used Black Friday the least.

# Products bought by women aged 17 and under.
target_2 <- train %>%
  filter(Gender == "F", Age == "0-17", Marital_Status == "0") %>%
  select(Gender, Age, City_Category, Marital_Status, Product_ID, Purchase)

# Five most frequently purchased products within this segment.
target_2_Product_ID <- target_2 %>%
  count(Product_ID) %>%
  arrange(desc(n)) %>%
  head(n = 5) %>%
  as.data.frame()
target_2_Product_ID

ggplot(
  data = target_2_Product_ID,
  mapping = aes(x = Product_ID, y = n, fill = Product_ID)
) +
  geom_col() +
  coord_flip() +
  labs(x = "상품", y = "판매량", title = "상품별 판매 빈도")

# Women aged 17 and under mainly bought
# P00003442 > P00085942 > P00145042 > P00000142 > P00102642.

# Products bought by unmarried women aged 55 and over.
target_3 <- train %>%
  filter(Gender == "F", Age == "55+", Marital_Status == "0") %>%
  select(Gender, Age, City_Category, Marital_Status, Product_ID, Purchase)

# Five most frequently purchased products within this segment.
target_3_Product_ID <- target_3 %>%
  count(Product_ID) %>%
  arrange(desc(n)) %>%
  head(n = 5) %>%
  as.data.frame()
target_3_Product_ID

ggplot(
  data = target_3_Product_ID,
  mapping = aes(x = Product_ID, y = n, fill = Product_ID)
) +
  geom_col() +
  coord_flip() +
  labs(x = "상품", y = "판매량", title = "상품별 판매 빈도")

# Women aged 55 and over mainly bought
# P00086042 > P00080342 > P00102642 > P00025442 > P00034742,
# but with fewer purchases than the other age bands.


# Most frequently purchased product per gender.
# Female customers: single product with the highest purchase count.
train %>%
  filter(Gender == "F") %>%
  select(Gender, Age, City_Category, Marital_Status, Product_ID, Purchase) %>%
  count(Product_ID) %>%
  arrange(desc(n)) %>%
  head(n = 1)

# Women bought P00265242 most often (508 times in the recorded run).

# Male customers: single product with the highest purchase count.
train %>%
  filter(Gender == "M") %>%
  select(Gender, Age, City_Category, Marital_Status, Product_ID, Purchase) %>%
  count(Product_ID) %>%
  arrange(desc(n)) %>%
  head(n = 1)
# Men bought P00265242 most often (1372 times in the recorded run).



# Purchase counts per (product, age band) pair, largest counts first.
train[, c("Age", "Product_ID")] %>%
  group_by(Product_ID) %>%
  count(Age) %>%
  arrange(desc(n))


# Part 2: association rules.
# The assignment asks for the association-rule task to be run after preparing
# the data with the code below.

# Read train_3_18.csv into `dataset`.
dataset <- read.csv("train_3_18.csv")

library(tidyverse)
library(scales)
library(arules)

# Reshape the long (User_ID, Product_ID) records into a wide matrix:
# after spread() there is one column per User_ID and one row per `id`;
# t() then transposes it so that each row corresponds to one original column.
customers_products <- dataset %>%
    select(User_ID, Product_ID) %>% # Selecting the columns we will need
    group_by(User_ID) %>% # Grouping by "User_ID" 
    arrange(User_ID) %>% # Arranging by "User_ID" 
    mutate(id = row_number()) %>% # Running index of each purchase within its user group
    spread(User_ID, Product_ID) %>% # One column per User_ID, filled with that user's Product_IDs
    t()

# Round-trip through a CSV file so arules can parse each line as a transaction.
# NOTE(review): write.csv() also writes a header line, row names and "NA" for
# empty cells by default, and the transposed matrix still contains the `id`
# row. These presumably end up as extra items/transactions in the result
# (the recorded 5893 rows look like users + 2) - confirm before relying on
# the rule statistics.
write.csv(customers_products, file = 'customers_products.csv')
customersProducts <- read.transactions('customers_products.csv', sep = ',', rm.duplicates = TRUE)       
       
customersProducts       
summary(customersProducts)
# Recorded summary output: 5893 rows (transactions)
# and 11575 columns (items).
print(5893*11575)
# Recorded "most frequent items" from summary():
# P00265242 P00025442 P00110742 P00112142 P00057642   (Other) 
# 1880      1615      1612      1562      1470        549873 


# Print the transactions.
# NOTE(review): this prints every transaction; wrap the argument in head() if
# the console output becomes unmanageable.
inspect(customersProducts)

# Relative frequency of the 10 most common items.
itemFrequencyPlot(customersProducts, topN = 10)

# There are many distinct products, so the support threshold is set to 0.01.
CpRules <- apriori(
  customersProducts,
  parameter = list(support = 0.01, confidence = 0.2, minlen = 2)
)
CpRules

# Keep the 50 rules with the highest lift.
# The original called head() on the value returned by inspect(), which is a
# printing function: it printed every rule, and its return value is not a
# reliable input for further processing. Subset the rules object first, print
# only that subset, then coerce it to a data frame (columns include `support`,
# `confidence` and `lift`) for plotting.
CpRules_top <- head(sort(CpRules, by = "lift"), 50)
inspect(CpRules_top)
CpRules_head <- as(CpRules_top, "data.frame")
# Customers who bought P00179042 are likely to also buy P00179242.
# Customers who bought P00179142 are likely to also buy P00179242.
# Customers who bought P00271442 are likely to also buy P00271542.

# Visualisation: support and confidence are both continuous measures, so draw
# one point per rule instead of bars.
ggplot(CpRules_head, aes(x = support, y = confidence)) +
  geom_point()

728x90

현재글블랙프라이데이 데이터 EDA

토토모의 분석일지

블랙프라이데이 데이터 EDA

'Kaggle/My kaggle'의 다른글

티스토리툴바