Data Visualization Techniques with Diamonds Dataset in R
Posted on Jan 7, 2025 in Computer Engineering
library(tidyverse)
Visualization of Distributions
Diamonds Dataset
# Inspect the column types and per-column summaries of the dataset.
str(diamonds)
summary(diamonds)

# Categorical variable: one bar per cut, followed by the same counts
# as a table.
ggplot(diamonds) +
  geom_bar(aes(x = cut))
diamonds %>%
  count(cut)

# Continuous variable: histogram of carat with 0.5-wide bins, followed
# by row counts per 0.5-wide carat interval.
ggplot(diamonds) +
  geom_histogram(aes(x = carat), binwidth = 0.5)
diamonds %>%
  count(cut_width(carat, 0.5))

# Keep only diamonds below 3 carats and look at them with finer,
# 0.1-wide bins.
smaller <- diamonds %>%
  filter(carat < 3)
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Same binning, drawn as one frequency polygon per cut (line color).
ggplot(smaller, aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)
Typical Values
# Histogram of carat for the `smaller` subset using very narrow,
# 0.01-wide bins.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)
Outliers
# Histogram of the y column with 0.5-wide bins.
ggplot(diamonds) +
  geom_histogram(aes(x = y), binwidth = 0.5)
Zooming In on the Previous Plot
# Histogram of y again, with the visible count axis limited to 0-50 so
# that bins holding only a few rows become visible.
ggplot(diamonds) +
  geom_histogram(aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Collect the rows whose y is below 3 or above 20, sorted by y, and
# print them.
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  arrange(y)
unusual
Missing Values
# Option 1: drop every row whose y lies outside the range 3 to 20.
diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

# Option 2: keep every row and replace out-of-range y values with NA.
# Note that this assignment overwrites the result of option 1.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

# Scatterplot of x against y, first with the point layer's default
# handling of missing values, then with na.rm = TRUE on that layer.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)
Covariation: Behavior Between Variables
A Continuous Variable Broken Down by a Categorical Variable
# Price distribution per cut as overlaid frequency polygons; the y axis
# shows raw counts in 500-wide price bins.
ggplot(diamonds, aes(x = price)) +
  geom_freqpoly(aes(color = cut), binwidth = 500)

# Same polygons with density on the y axis instead of counts.
# after_stat(density) replaces the `..density..` notation, which is
# deprecated since ggplot2 3.4.0 and triggers a deprecation warning.
ggplot(diamonds, aes(x = price, y = after_stat(density))) +
  geom_freqpoly(aes(color = cut), binwidth = 500)

# One boxplot of price per cut.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()
Qualitative Variables: Reordering Categories
# Boxplot of hwy for each class, with classes in their default order.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Same boxplots with class reordered by the median of hwy.
ggplot(mpg) +
  geom_boxplot(aes(x = reorder(class, hwy, FUN = median), y = hwy))

# Reordered boxplots again, with the axes flipped.
ggplot(mpg) +
  geom_boxplot(aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  coord_flip()
Two Qualitative Variables
# Count plot of the cut / color combinations.
ggplot(diamonds) +
  geom_count(aes(x = cut, y = color))

# Base R frequency tables: color alone, then color crossed with cut.
table(diamonds$color)
table(diamonds$color, diamonds$cut)

# Count each color / cut pair with dplyr and draw the counts as a
# tile plot, with the count n mapped to the fill color.
diamonds %>%
  count(color, cut) %>%
  ggplot(aes(x = color, y = cut)) +
  geom_tile(aes(fill = n))
Two Continuous Variables
# Scatterplot of price against carat for the full dataset.
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price))

# Same scatterplot with nearly transparent points (alpha = 1 / 100).
ggplot(diamonds) +
  geom_point(aes(x = carat, y = price), alpha = 1 / 100)

# Boxplots of price for the `smaller` subset, grouped by 0.1-wide
# carat intervals.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))

# Boxplots grouped with cut_number() instead, using 10 and then 20
# carat groups.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 10)))
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))
Patterns and Models
# Scatterplot of waiting against eruptions from the faithful dataset.
ggplot(faithful) +
  geom_point(aes(x = eruptions, y = waiting))
Models
library(modelr)
# Fit a linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Add the model residuals as a `resid` column, then exponentiate them
# to undo the log applied to price in the model formula.
diamonds2 <- diamonds %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

# Residuals against carat.
ggplot(diamonds2) +
  geom_point(aes(x = carat, y = resid))

# Residuals against cut, first as points and then as boxplots.
ggplot(diamonds2) +
  geom_point(aes(x = cut, y = resid))
ggplot(diamonds2) +
  geom_boxplot(aes(x = cut, y = resid))