Data Visualization 3

code
data-visualization
Author

Alexis Kruzicki

Published

February 21, 2024

Code
library(knitr)
library(tidyverse)
library(socviz)
library(ggthemes)
library(ggrepel)
library(ggtext)
library(hrbrthemes)
library(gapminder)


Question 1

The following data is for Question 1:

Code
# Keep a copy of the gapminder data set (from the gapminder package) under the
# name `gapminder` in the current environment.
gapminder <- gapminder::gapminder


Q1a

  • Replicate the following ggplot.
    • Use the color #0072B2 for dots.

Answer:

Code
# Continents to display, one facet each.
continents <- c("Africa", "Americas", "Asia", "Europe")

# 2007 observations restricted to the continents listed above.
gapminder_1 <- gapminder::gapminder |>
  filter(year == 2007, continent %in% continents)

# Dot plot of life expectancy with countries ordered by life expectancy.
# Each dot is labelled with its value; the x range is widened to 110 so the
# labels drawn to the right of the dots are not clipped.
ggplot(
  data = gapminder_1,
  mapping = aes(x = lifeExp, y = reorder(country, lifeExp))
) +
  geom_point(color = "#0072B2") +
  geom_text(mapping = aes(label = lifeExp), hjust = -0.25) +
  facet_wrap(continent ~ ., scales = "free_y") +
  xlim(40, 110)



Q1b

  • Make a simple comment on the visualization result.

Answer:

Europe has the highest overall life expectancy.




Question 2

The following data is for Question 2:

Code
# Long-format tweet counts: one row per year and count type.
n_tweets_long <- read_csv(
  file = "https://bcdanl.github.io/data/n_tweets_long.csv"
)
Rows: 24 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): type
dbl (2): year, n

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Q2a

  • Replicate the following ggplot.

    • The following describes the type values:
      • n_ot_us: Number of US tweets
      • n_ot_wrld: Number of worldwide tweets
      • n_rt_lk_us: Number of US retweets & likes
      • n_rt_lk_wrld: Number of worldwide retweets & likes
    • Use the colors, maroon and #428bca properly.

Answer:

Code
# d <- data.frame(x=1:10, y=1/(10:1))
# ggplot(d, aes(x= year, y=)) + geom_bar(stat="identity")

# library(ggplot2)

# Original tweets: keep the US and worldwide series and give them display
# labels.
n_tweets <- n_tweets_long |>
  filter(type %in% c("n_ot_us", "n_ot_wrld")) |>
  mutate(type = ifelse(type == "n_ot_us", "US", "Worldwide"))

# Retweets & likes: same treatment for the US and worldwide series.
n_retweets_lks <- n_tweets_long |>
  filter(type %in% c("n_rt_lk_us", "n_rt_lk_wrld")) |>
  mutate(type = ifelse(type == "n_rt_lk_us", "US", "Worldwide"))

# Dodged bars for tweets, with lines and points for retweets & likes on top.
ggplot(data = n_tweets, mapping = aes(x = year, y = n)) +
  geom_col(mapping = aes(fill = type), position = "dodge") +
  geom_line(
    data = n_retweets_lks,
    mapping = aes(color = type),
    linewidth = 3
  ) +
  geom_point(data = n_retweets_lks, size = 3) +
  scale_fill_manual(values = c("maroon", "#428bca")) +
  scale_color_manual(values = c("maroon", "#428bca")) +
  scale_x_continuous(breaks = 2012:2017) +
  labs(
    x = "Year",
    y = "Number of Tweets, Retweets & Likes\n(in thousand)",
    fill = "Tweets",
    color = "Retweets and likes"
  ) +
  # Both legends share the same layout; `order` puts the fill legend first.
  guides(
    fill = guide_legend(
      reverse = TRUE,
      label.position = "bottom",
      keywidth = 3,
      nrow = 2,
      order = 1
    ),
    color = guide_legend(
      reverse = TRUE,
      label.position = "bottom",
      keywidth = 3,
      nrow = 2,
      order = 2
    )
  ) +
  theme_minimal() +
  theme(legend.position = "top")



Q2b.

  • Make a simple comment on the visualization result.

As the years go by, the number of tweets, retweets, and likes increases greatly, with the worldwide counts showing the larger increase.




Question 3

The following data set is for Question 3:

Code
# Electricity data for the USA and China (identified by `iso3c`).
electricity <- read_csv(
  file = "https://bcdanl.github.io/data/electricity-usa-chn.csv"
)
Rows: 360 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): energy, label, iso3c
dbl (2): year, value

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


Code
# Number of rows per country code.
count(electricity, iso3c)
# A tibble: 2 × 2
  iso3c     n
  <chr> <int>
1 CHN     180
2 USA     180
Code
# Replace the country codes with display names: "CHN" becomes "China" and
# every other code becomes "United States".
electricity <- electricity |>
  mutate(iso3c = ifelse(iso3c == "CHN", "China", "United States"))
Code
# Re-check the row counts after recoding `iso3c`.
count(electricity, iso3c)
# A tibble: 2 × 2
  iso3c             n
  <chr>         <int>
1 China           180
2 United States   180

Q3a

  • Replicate the following ggplot.

Answer:

Code
# Electricity generation over time by energy source, one panel per country.
# Fixes: the axis label typo ("Generartion") and the stray empty argument
# that was passed to facet_wrap().
ggplot(data = electricity) +
  geom_line(
    aes(x = year, y = value, color = energy),
    linewidth = 3
  ) +
  # Reference lines at y = 0 and at the year 1985.
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 1985) +
  facet_wrap(iso3c ~ .) +
  scale_colour_viridis_d(option = "plasma") +
  theme(legend.position = "top") +
  labs(
    x = "Year",
    y = "Electricity Generation\n(TWh)",
    color = "Energy"
  ) +
  guides(color = guide_legend(label.position = "bottom", keywidth = 5))



Q3b

  • Replicate the following ggplot.

Answer:

Code
# Share of each energy source in a country's total generation for that year.
# ungroup() drops the grouping afterwards so later operations on `electricity`
# are not silently computed per (iso3c, year) group.
electricity <- electricity |>
  group_by(iso3c, year) |>
  mutate(pct = value / sum(value)) |>
  ungroup()

# Same layout as the level plot, but the y axis shows shares formatted as
# percentages, so the axis label no longer claims TWh units.
ggplot(data = electricity) +
  geom_line(
    aes(x = year, y = pct, color = energy),
    linewidth = 3
  ) +
  # Reference lines at y = 0 and at the year 1985.
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 1985) +
  facet_wrap(iso3c ~ .) +
  scale_colour_viridis_d() +
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position = "top") +
  labs(
    x = "Year",
    y = "Electricity Generation\n(share of total)",
    color = "Energy"
  ) +
  guides(color = guide_legend(label.position = "bottom", keywidth = 5))




Question 4

The following data set is for Question 4:

Code
# Starbucks drink nutrition data; the columns are described below.
starbucks <- read_csv(
  file = "https://bcdanl.github.io/data/starbucks.csv"
)
Rows: 1116 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (4): product_name, size, trans_fat_g, fiber_g
dbl (11): milk, whip, serv_size_m_l, calories, total_fat_g, saturated_fat_g,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Variable description

  • Product_Name: Product Name
  • Size: Size of drink (short, tall, grande, venti)
  • Milk: Type of milk used
    • 0 none
    • 1 nonfat
    • 2 2%
    • 3 soy
    • 4 coconut
    • 5 whole
  • Whip: Whip added or not (binary 0/1)
  • Serv_Size_mL: Serving size in ml
  • Calories: KCal
  • Total_Fat_g: Total fat grams
  • Saturated_Fat_g: Saturated fat grams
  • Trans_Fat_g: Trans fat grams
  • Cholesterol_mg: Cholesterol mg
  • Sodium_mg: Sodium milligrams
  • Total_Carbs_g: Total Carbs grams
  • Fiber_g: Fiber grams
  • Sugar_g: Sugar grams
  • Caffeine_mg: Caffeine in milligrams

Q4a.

  • Add the following two variables to starbucks data.frame
    • caffeine_mgml: Caffeine in milligrams per mL
    • calories_kcml: Calories KCal per mL

Answer:

Code
# Add caffeine (mg) and calories (KCal) per mL of serving size.
starbucks <- starbucks |>
  mutate(
    caffeine_mgml = caffeine_mg / serv_size_m_l,
    calories_kcml = calories / serv_size_m_l
  )



Q4b.

  • Calculate a mean caffeine_mgml and a mean calories_kcml for each product_name.

Answer:

Code
# Mean caffeine and calories per mL for each product.
starbucks_1 <- starbucks |>
  group_by(product_name) |>
  summarise(across(c(caffeine_mgml, calories_kcml), mean))



Q4c.

  • For the top 10 product_name in terms of caffeine_mgml and the top 10 product_name in terms of calories_kcml, replicate the following ggplot.

  • Use the following commands for showing texts in the plot:

Code
# install.packages("showtext")
library(showtext)
Warning: package 'showtext' was built under R version 4.3.3
Loading required package: sysfonts
Warning: package 'sysfonts' was built under R version 4.3.3
Loading required package: showtextdb
Warning: package 'showtextdb' was built under R version 4.3.3
Code
# Turn on showtext rendering and register the Google font under the family
# name "annie".
showtext_auto()
font_add_google(name = "Annie Use Your Telescope", family = "annie")
Code
# Ten products with the highest mean caffeine per mL.
starbucks_2 <- starbucks_1 |>
  arrange(desc(caffeine_mgml)) |>
  head(10)

# Ten products with the highest mean calories per mL.
starbucks_3 <- starbucks_1 |>
  arrange(desc(calories_kcml)) |>
  head(10)

# Stack both top-10 lists. distinct() removes the duplicate row of any product
# that is in both lists, so it is not plotted and labelled twice.
starbucks_4 <- bind_rows(starbucks_2, starbucks_3) |>
  distinct()

# Caffeine vs. calories per mL, with each product labelled in the "annie" font.
s <- ggplot(
  starbucks_4,
  aes(x = calories_kcml, y = caffeine_mgml)
) +
  geom_point(aes(color = product_name)) +
  geom_text_repel(
    aes(label = product_name, color = product_name),
    family = "annie"
  ) +
  # Points are labelled directly, so the color legend is hidden.
  guides(color = "none") +
  labs(
    x = "Calories",
    y = "Caffeine\n(mgML)",
    title = "STARBUCKS DRINKS",
    subtitle = "Caffeine or Calories, which one would you go?"
  ) +
  # Starbucks logo placed at (0.6, 0.5) in data coordinates.
  annotate(
    "richtext",
    x = 0.6,
    y = 0.5,
    label = "<img src='https://bcdanl.github.io/lec_figs/starbucks.png' width='100'/>",
    color = NA
  ) +
  # Shaded rectangle over the low-calorie, high-caffeine region.
  annotate(
    geom = "rect",
    xmin = 0, xmax = .34,
    ymin = .41, ymax = .8,
    fill = "lightgray",
    alpha = 0.5
  ) +
  # Shaded rectangle over the high-calorie, low-caffeine region.
  annotate(
    geom = "rect",
    xmin = 0.68, xmax = .93,
    ymin = 0, ymax = .34,
    fill = "lightgray",
    alpha = 0.5
  ) +
  theme(plot.title = element_text(colour = "#00704A"))


s


- Use the following `annotate()` geom to insert the starbucks image in the plot:

::: {.cell}

```{.r .cell-code}
annotate("richtext", 
           x = Calories , 
           y =  Caffeine, 
           label = "<img src='https://bcdanl.github.io/lec_figs/starbucks.png' width='100'/>", 
           fill =  ,
           size =  , 
           color =  )

```
:::

  • Use the following geom_text_repel() geom to use the annie font
Code
geom_text_repel(max.overlaps = ,
                  size =  ,
                  min.segment.length =  ,
                  point.padding =  ,
                  box.padding =  ,
                  show.legend =  ,
                  family = "annie")
  • Use the color, #00704A, for the title.

Answer: