Data Visualization Techniques with Diamonds Dataset in R

library(tidyverse)

Visualization of Distributions

Diamonds Dataset

# Inspect the structure and the summary statistics of the dataset.
str(diamonds)
summary(diamonds)
  • # Bar chart with one bar per level of `cut`.
    ggplot(diamonds, aes(x = cut)) +
      geom_bar()
  • # Row counts per `cut`, as a table.
    count(diamonds, cut)
  • # Histogram of `carat` with bins 0.5 wide.
    ggplot(diamonds, aes(x = carat)) +
      geom_histogram(binwidth = 0.5)
  • # Row counts per 0.5-wide `carat` interval built by cut_width().
    count(diamonds, cut_width(carat, 0.5))
# Subset reused by later plots: diamonds with carat below 3.
smaller <- filter(diamonds, carat < 3)
  • # Histogram of the subset with bins 0.1 wide.
    ggplot(smaller, aes(x = carat)) +
      geom_histogram(binwidth = 0.1)
  • # Frequency polygons (same 0.1 binwidth), one colored line per cut.
    ggplot(smaller, aes(x = carat, color = cut)) +
      geom_freqpoly(binwidth = 0.1)

Typical Values

  • # Histogram of `carat` for the `smaller` subset with very narrow
    # (0.01-wide) bins.
    ggplot(smaller, aes(x = carat)) +
      geom_histogram(binwidth = 0.01)

Outliers

# Histogram of `y` over all diamonds, using bins 0.5 wide.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

Zoom of the Previous Plot

# Same histogram of `y`, with the visible count axis limited to 0-50 through
# coord_cartesian() so that rarely populated bins become visible.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))
# Rows whose `y` is below 3 or above 20, sorted by `y`, then printed.
unusual <- arrange(filter(diamonds, y < 3 | y > 20), y)
unusual

Missing Values

  • # Option 1: keep only the rows whose `y` lies between 3 and 20.
    diamonds2 <- filter(diamonds, between(y, 3, 20))
  • # Option 2 (overwrites the result of option 1): keep every row and
    # replace `y` values below 3 or above 20 with NA.
    diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))
  • # Scatterplot of `x` against `y`; the NA values are still in the data.
    ggplot(diamonds2, aes(x = x, y = y)) +
      geom_point()
  • # Same scatterplot, with na.rm = TRUE set on the point layer.
    ggplot(diamonds2, aes(x = x, y = y)) +
      geom_point(na.rm = TRUE)

Covariation: Behavior Between Variables

A Qualitative and a Continuous Variable

  • # Frequency polygons of `price` (bins 500 wide), one colored line per
    # cut; the vertical axis is the raw count.
    ggplot(data = diamonds, mapping = aes(x = price)) +
      geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
  • # Same polygons with density on the vertical axis. after_stat(density)
    # replaces the `..density..` notation, deprecated since ggplot2 3.4.0.
    ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
      geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
  • # Boxplot of `price` for each level of `cut`.
    ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
      geom_boxplot()

Reordering a Qualitative Variable

  • # Boxplot of `hwy` for each `class` in the mpg dataset.
    ggplot(mpg, aes(x = class, y = hwy)) +
      geom_boxplot()
  • # Same boxplot, with `class` reordered by the median of `hwy`.
    ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
      geom_boxplot()
  • # Reordered boxplot again, with the axes swapped by coord_flip().
    ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
      geom_boxplot() +
      coord_flip()

Two Qualitative Variables

  • # geom_count() plot of the `cut` (x) and `color` (y) combinations.
    ggplot(diamonds, aes(x = cut, y = color)) +
      geom_count()
  • # One-way frequency table of `color`.
    table(diamonds$color)
  • # Two-way frequency table: `color` in rows, `cut` in columns.
    table(diamonds$color, diamonds$cut)
  • # Count each (color, cut) pair and draw a tile plot filled by the
    # count column `n`.
    diamonds %>%
      count(color, cut) %>%
      ggplot(aes(x = color, y = cut)) +
      geom_tile(aes(fill = n))

Two Continuous Variables

  • # Scatterplot of `price` against `carat` for all diamonds.
    ggplot(diamonds, aes(x = carat, y = price)) +
      geom_point()
  • # Same scatterplot with nearly transparent points (alpha = 1 / 100).
    ggplot(diamonds, aes(x = carat, y = price)) +
      geom_point(alpha = 1 / 100)
  • # Boxplots of `price` over `carat` for the `smaller` subset, grouped
    # into 0.1-wide carat intervals by cut_width().
    ggplot(smaller, aes(x = carat, y = price)) +
      geom_boxplot(aes(group = cut_width(carat, 0.1)))
  • # Same, grouped into 10 bins by cut_number().
    ggplot(smaller, aes(x = carat, y = price)) +
      geom_boxplot(aes(group = cut_number(carat, 10)))
  • # Same, grouped into 20 bins by cut_number().
    ggplot(smaller, aes(x = carat, y = price)) +
      geom_boxplot(aes(group = cut_number(carat, 20)))

Patterns and Models

# Scatterplot of `waiting` against `eruptions` from the faithful dataset.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point()

Models

library(modelr)
# Linear model of log(price) on log(carat), fitted on all diamonds.
mod <- lm(log(price) ~ log(carat), data = diamonds)
# Attach the model residuals as `resid`, then exponentiate them to undo
# the log applied to price in the model formula.
diamonds2 <- mutate(add_residuals(diamonds, mod), resid = exp(resid))
  • # Scatterplot of the exponentiated residuals against `carat`.
    ggplot(diamonds2, aes(x = carat, y = resid)) +
      geom_point()
  • # The residuals plotted as points for each level of `cut`.
    ggplot(diamonds2, aes(x = cut, y = resid)) +
      geom_point()
  • # The same residuals summarized as one boxplot per `cut`.
    ggplot(diamonds2, aes(x = cut, y = resid)) +
      geom_boxplot()