library(tidyverse)
library(ggplot2)
library(patchwork)
library(plotly)
# Read the raw transaction records from the CSV file (relative path).
data <- readr::read_csv(file = "Fraudulent_E-Commerce_Transaction_Data.csv")
#' Select and rename the transaction columns used in the analysis
#'
#' Adds a sequential `ID` column to `dataframe`, keeps only the columns named
#' in `colsToKeep`, and renames the known multi-word columns so that their
#' names use underscores instead of spaces.
#'
#' @param dataframe A data frame of raw transactions.
#' @param colsToKeep Character vector of column names to keep, written with
#'   the raw (space-separated) names. It may include `"ID"`, which is created
#'   by this function. Every name must exist, otherwise `select()` errors.
#' @return The selected columns, with underscore-style names wherever a
#'   mapping is defined below.
cleanData <- function(dataframe, colsToKeep) {
  # new name = old name. Passed through any_of() so that a mapping whose
  # source column was not kept is skipped instead of raising an error.
  renamedColumns <- c(
    Transaction_Amount = "Transaction Amount",
    Transaction_Date = "Transaction Date",
    Payment_Method = "Payment Method",
    Product_Category = "Product Category",
    Customer_Location = "Customer Location",
    Customer_Age = "Customer Age",
    Device_Used = "Device Used",
    Is_Fraudulent = "Is Fraudulent",
    Account_Age_Days = "Account Age Days",
    Transaction_Hour = "Transaction Hour"
  )

  dataframe %>%
    rowid_to_column("ID") %>%
    # all_of() marks colsToKeep as an external character vector; passing it
    # bare to select() is deprecated since tidyselect 1.1.0.
    select(all_of(colsToKeep)) %>%
    rename(any_of(renamedColumns))
}

colsToKeep <- c(
  "ID", "Transaction Amount", "Transaction Date", "Payment Method",
  "Product Category", "Quantity", "Customer Location", "Customer Age",
  "Device Used", "Is Fraudulent", "Account Age Days", "Transaction Hour"
)
cleanedData <- cleanData(data, colsToKeep = colsToKeep)
# Line chart of transaction amount against transaction date for all rows.
ggplot(
  data = cleanedData,
  mapping = aes(x = Transaction_Date, y = Transaction_Amount)
) +
  geom_line() +
  labs(
    x = "Date",
    y = "Transaction Amount (Dollars)",
    title = "Transaction Amount Over Time"
  )
# Coerce the transaction date column to Date so it can be compared with the
# Date bounds below.
cleanedData <- cleanedData %>%
  mutate(Transaction_Date = as.Date(Transaction_Date))

# Keep the transactions dated from 1 to 31 January 2024, both ends included.
startDate <- as.Date("2024-01-01")
endDate <- as.Date("2024-01-31")
januaryData <- filter(
  cleanedData,
  Transaction_Date >= startDate & Transaction_Date <= endDate
)
# Line chart of transaction amount by date for the January subset.
ggplot(
  data = januaryData,
  mapping = aes(x = Transaction_Date, y = Transaction_Amount)
) +
  geom_line() +
  labs(
    x = "Date",
    y = "Transaction Amount (dollars)",
    title = "Transactions from January 1, 2024 to January 31, 2024"
  )
# Restrict the January subset to transactions dated 26 January 2024.
j26Data <- filter(januaryData, Transaction_Date == "2024-01-26")

# Per product category: rows with Is_Fraudulent == 1 divided by all rows.
product_proportions <- summarise(
  group_by(j26Data, Product_Category),
  fraudulent_proportion = sum(Is_Fraudulent == 1) / n()
)

# Per payment method: rows with Is_Fraudulent == 1 divided by all rows.
payment_proportions <- summarise(
  group_by(j26Data, Payment_Method),
  fraudulent_proportion = sum(Is_Fraudulent == 1) / n()
)
# Bar chart of the fraudulent share for each product category.
fig1 <- ggplot(
  data = product_proportions,
  mapping = aes(
    x = Product_Category,
    y = fraudulent_proportion,
    fill = Product_Category
  )
) +
  geom_col() +
  labs(
    title = "Relative Frequencies of Fraudulent Transactions by Product Category",
    x = "Product Category",
    y = "Proportion of Fraudulent Transactions",
    fill = "Product Category"
  ) +
  theme_minimal() +
  # Tilt the category labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart of the fraudulent share for each payment method.
fig2 <- ggplot(
  data = payment_proportions,
  mapping = aes(
    x = Payment_Method,
    y = fraudulent_proportion,
    fill = Payment_Method
  )
) +
  geom_col() +
  labs(
    title = "Relative Frequencies of Fraudulent Transactions by Payment Method",
    x = "Payment Method",
    y = "Proportion of Fraudulent Transactions",
    fill = "Payment Method"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Figure 3
fig1
# Figure 4
fig2
# 26 January transactions for health & beauty products paid by debit card.
# This subset used to be stored only under the name `health_beauty_credit`,
# which contradicts the "debit card" filter. The accurate name is used here;
# the old name is kept as an alias in case other code still refers to it.
health_beauty_debit_j26 <- j26Data %>%
  filter(Product_Category == "health & beauty", Payment_Method == "debit card")
health_beauty_credit <- health_beauty_debit_j26

num_transactions <- nrow(health_beauty_debit_j26)
print(num_transactions)
## [1] 826
# Percentage of the rows in this subset that have Is_Fraudulent == 1.
num_fraudulent_transactions <- sum(health_beauty_debit_j26$Is_Fraudulent == 1)
percent <- (num_fraudulent_transactions / num_transactions) * 100
print(percent)
## [1] 4.116223
# Health & beauty purchases paid by debit card, taken from the full data set.
health_beauty_debit <- filter(
  cleanedData,
  Product_Category == "health & beauty",
  Payment_Method == "debit card"
)
num_transactions <- nrow(health_beauty_debit)
print(num_transactions)
## [1] 73475
# Percentage of those rows that have Is_Fraudulent == 1.
num_fraudulent_transactions <- sum(health_beauty_debit[["Is_Fraudulent"]] == 1)
percent <- num_fraudulent_transactions / num_transactions * 100
print(percent)
## [1] 5.007145
# Percentage of debit card transactions with Is_Fraudulent == 1; rows where
# the comparison is NA are left out of the mean.
debitCardData <- summarise(
  filter(cleanedData, Payment_Method == "debit card"),
  percent = 100 * mean(Is_Fraudulent == 1, na.rm = TRUE)
)
debitCardData
## # A tibble: 1 × 1
## percent
## <dbl>
## 1 5.02
# All transactions paid by credit card.
credit <- filter(cleanedData, Payment_Method == "credit card")
num_transactions <- nrow(credit)
print(num_transactions)
## [1] 368429
# Percentage of credit card transactions that have Is_Fraudulent == 1.
num_fraudulent_transactions <- sum(credit[["Is_Fraudulent"]] == 1)
percent <- num_fraudulent_transactions / num_transactions * 100
print(percent)
## [1] 4.970564
# Count the transactions for every (customer age, fraud flag) combination.
ageData <- cleanedData %>%
  select(Customer_Age, Is_Fraudulent)
summary_data <- ageData %>%
  group_by(Customer_Age, Is_Fraudulent) %>%
  # "drop_last" is the default for this call; stating it keeps the result
  # grouped by Customer_Age and stops summarise() from printing its
  # grouped-output message.
  summarise(count = n(), .groups = "drop_last") %>%
  filter(!is.na(Customer_Age))

# Grouped bar chart of the counts, coloured by fraud flag.
# Colours are given explicitly because the default palette lookup requests
# fewer than RColorBrewer's minimum of three colours and emits a warning
# ("minimal value for n is 3") for each call.
plot_ly(
  summary_data,
  x = ~Customer_Age,
  y = ~count,
  color = ~factor(Is_Fraudulent),
  colors = c("#66C2A5", "#8DA0CB"),
  type = "bar"
) %>%
  layout(
    title = "Fraud by Age",
    xaxis = list(title = "Customer Age"),
    yaxis = list(title = "Count"),
    barmode = "group"
  )
# Debit card transactions made by customers aged 25 to 45 (inclusive).
middleAgeDate <- cleanedData %>%
  filter(Customer_Age >= 25, Customer_Age <= 45, Payment_Method == "debit card")
# The denominator must be the size of THIS subset. It was previously not
# recomputed here, so the division silently reused the credit card row count
# left in num_transactions by the preceding calculation.
num_transactions <- nrow(middleAgeDate)
num_fraudulent_transactions <- sum(middleAgeDate$Is_Fraudulent == 1)
# Percentage of the rows in this subset that have Is_Fraudulent == 1.
percent <- (num_fraudulent_transactions / num_transactions) * 100
print(percent)
# NOTE: the output recorded earlier for this step (3.539624) was produced
# with the wrong denominator and is not the fraud rate of this subset;
# re-run the script to obtain the corrected figure.