crunchbase_female_founders.Rmd

---
title: "Get Crunchbase Data"
output: 
  html_document:
    code_folding: hide
---

```{r}
needs(tidyverse, RMySQL, dbConnect, gender, lubridate, wordcloud, forcats,
      viridis, stringr, magrittr, assertr, knitr)
source("data_assembly.R")
```

```{r}
# Connect to the local MySQL database with the Crunchbase 2013 data
con <- dbConnect(MySQL(), user = 'root', dbname = 'mytestdatabase')

# Get all tables used, as a list named by table
cb_tables <- c("cb_funding_rounds", "cb_investments", "cb_objects",
               "cb_people", "cb_relationships")
data <- cb_tables %>% map(get_cb_data, con)
names(data) <- cb_tables

# Assemble founders data
founders <- assemble_founders_data(
  data$cb_relationships, data$cb_people, data$cb_objects)
# Check that every founder appears only once per company
stopifnot(founders %>% nrow == 
            founders %>% select(person_id, company_id) %>% unique %>% nrow)

# Assemble rounds data
rounds <- assemble_rounds_data(
  data$cb_funding_rounds, data$cb_investments, data$cb_objects,
  years = 2009:2013, round_codes = c("seed", "a", "b", "c"))
# Check that every investor / company / round appears only once
stopifnot(rounds %>% nrow ==
            rounds %>% select(company_id, investor_id, funding_round_code) %>% unique %>% nrow)
  
# Join the two together and filter unknown gender
combined <- rounds %>%
  inner_join(founders, by = "company_id") %>%
  filter(!is.na(gender))

# `con` is not used again in this document, so release the connection here
# instead of leaving it open for the rest of the session. invisible() keeps
# dbDisconnect()'s return value out of the knitted output.
invisible(dbDisconnect(con))

# Report the row count of each assembled table
{
  "%s founders rows" %>% sprintf(nrow(founders)) %>% writeLines
  "%s rounds rows" %>% sprintf(nrow(rounds)) %>% writeLines
  "%s combined rows" %>% sprintf(nrow(combined)) %>% writeLines
}
```

### name confidence

```{r}
# Gender analysis confidence: bucket each founder by how decisively the
# name-based estimate (proportion_female) points to one gender, then tabulate.
# Later assignments override earlier ones, so the NA check goes last.
founders %>%
  mutate(
    confidence = 'uncertain',
    confidence = ifelse(proportion_female < .05, 'male', confidence),
    confidence = ifelse(proportion_female > .95, 'female', confidence),
    confidence = ifelse(is.na(proportion_female), 'missing', confidence)
  ) %>%
  group_by(confidence) %>%
  tally() %>%
  ungroup() %>%
  mutate(pct = n / sum(n))
```

### founder names

```{r}
# Wordcloud of founder names: the 500 most frequent (first_name, gender)
# pairs, sized by the square root of their count and coloured by gender.
df <- founders %>%
  group_by(first_name, gender) %>%
  tally() %>%
  ungroup() %>%
  mutate(
    color = ifelse(is.na(gender), 'grey',
                   ifelse(gender == 'female', 'green', 'orange'))
  ) %>%
  arrange(desc(n)) %>%
  slice(1:500) %>%
  mutate(s = sqrt(n))

# Rescale the size range so that the most frequent name is drawn at 1.5
r <- range(df$s)
r <- r / max(r) * 1.5

# wordcloud() expects scale as (largest, smallest), hence rev()
wordcloud(
  df$first_name, df$s,
  scale = rev(r),
  random.order = FALSE,
  colors = df$color,
  ordered.colors = TRUE
)
```

### Overall

```{r}
# Overall summary statistics over distinct investor / company / founder
# combinations (a founder is counted once per investor, however many rounds
# that investor joined).
combined %>%
  select(investor_id, company_id, person_id, gender) %>%
  unique  %>% # dedupe over rounds
  summarize(
    num_investments = n(),
    num_investors = n_distinct(investor_id),
    num_companies = n_distinct(company_id),
    num_founders = n_distinct(person_id),
    pct_female = mean(gender == 'female') * 100
  )
```

### Number of female founders

```{r}
# Count distinct founders per gender
combined %>%
  filter(!is.na(gender)) %>%
  select(person_id, gender) %>%
  unique() %>% # one row per founder
  group_by(gender) %>%
  tally()
```

### Tags

```{r}
# One row per company with a tag list, flagged by whether any of its founders
# (with a known gender) is female
tags <- combined %>%
  filter(!is.na(gender)) %>%
  mutate(is_female = gender == 'female') %>%
  group_by(company_id, tag_list) %>%
  summarize(any_female = any(is_female)) %>%
  ungroup() %>% # summarize() leaves the company_id grouping in place
  filter(!is.na(tag_list))

# Split one ', '-separated tag string into a one-column tibble of tags.
# tibble() replaces the deprecated data_frame() alias.
split_tags <- function(s) tibble(tag = str_split(s, ', ')[[1]])

# Expand to one row per company / tag
tags <- tags %>%
  mutate(tags = tag_list %>% map(split_tags)) %>%
  select(-tag_list) %>%
  unnest(tags)

# For the 50 most common tags, plot the share of companies with at least one
# female founder, ordered by that share
tags %>%
  filter(tag != '' & !is.na(tag)) %>%
  group_by(tag) %>%
  summarize(
    num_companies = n(),
    pct_any_female = mean(any_female) * 100) %>%
  arrange(desc(num_companies)) %>%
  slice(1:50) %>%
  mutate(tag = tag %>% fct_reorder(pct_any_female)) %>%
  ggplot(aes(pct_any_female, tag)) +
    geom_segment(aes(xend = 0, yend = tag), colour = 'grey80', alpha = .8) +
    geom_point(aes(size = num_companies), colour = 'grey20', alpha = .8) +
    labs(x = "Percent Any Women Founder", y = "Crunchbase Tag",
         size = "Number of\nCompanies") +
    theme_minimal() +
    theme(panel.background = element_blank(),
          panel.border = element_blank(), 
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_blank()) +
    scale_x_continuous(breaks = seq(0, 60, by = 5))
```

### Number of investments in female founders

```{r}
# Summary restricted to female founders
summarize_gender(filter(combined, gender == 'female'))

# Same summary over every founder in `combined`, for comparison
summarize_gender(combined)
```

### VC

```{r, fig.height = 5, fig.width = 8}
# Top 100 VCs: the 100 investors backing the most companies, with their
# founder gender summary
df_100_vcs <- combined %>%
  select(investor, company_id, person_id, gender) %>%
  unique() %>% # dedupe over rounds
  group_by(investor) %>%
  summarize_gender() %>%
  arrange(desc(num_companies)) %>%
  slice(1:100)

# Lollipop chart of the investors ranked by percent female founders, laid out
# in facet columns of 20 investors each
df_100_vcs %>%
  arrange(desc(pct_female)) %>%
  mutate(
    rank = 1:n(),
    investor = paste0(rank, ". ", investor), # prefix the label with its rank
    column = (rank - 1) %/% 20 + 1
  ) %>%
  group_by(column) %>%
  arrange(desc(pct_female)) %>%
  mutate(col_rank = n():1) %>% # best-ranked investor gets the largest y
  ggplot(aes(pct_female, col_rank)) +
    geom_segment(aes(xend = 0, yend = col_rank), color = 'grey70') +
    geom_point(aes(size = num_companies), color = 'grey70') +
    # NOTE(review): `adj` is not a documented geom_text() argument; `hjust`
    # may have been intended -- confirm before changing.
    geom_text(aes(x = 0, label = investor),
              adj = 0, nudge_y = .3, size = 2, color = 'grey20') +
    facet_grid(. ~ column) +
    theme(legend.position = 'none',
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          axis.line = element_blank()) +
    labs(x = "Percent Women Founders")
```

```{r}
# Smallest portfolio and founder-count range among the top 100 VCs
sprintf("%s min companies", min(pull(df_100_vcs, num_companies)))
sprintf("%s min founders", min(pull(df_100_vcs, num_founders)))
sprintf("%s max founders", max(pull(df_100_vcs, num_founders)))
```

Bottom 10 VCs

```{r}
# Ten investors with the lowest percent female founders
slice(arrange(df_100_vcs, pct_female), 1:10)
```

Top 10 VCs

```{r}
# Ten investors with the highest percent female founders
slice(arrange(df_100_vcs, desc(pct_female)), 1:10)
```

Examine examples

```{r}
# Table of the distinct founders behind one investor's companies, for a
# manual sanity check of the gender estimates
combined %>%
  filter(investor == 'Sutter Hill Ventures') %>%
  select(company_id, person_id, first_name, proportion_female, title) %>%
  unique() %>%
  arrange(company_id, person_id) %>%
  kable()
```

### region

```{r}
# Founder gender summary per region
df <- combined %>%
  select(investor_id, company_id, person_id, gender, region) %>%
  unique %>% # dedupe over rounds
  group_by(region) %>%
  summarize_gender

# Plot the 15 regions with the most companies (excluding 'unknown') on a log
# x axis. Argument names are spelled out in full (`limits`, `sides`) rather
# than relying on partial argument matching.
df %>%
  filter(region != 'unknown') %>%
  arrange(desc(num_companies)) %>%
  slice(1:15) %>%
  ggplot(aes(num_companies, pct_female)) +
    geom_point(aes(size = num_companies), colour = 'grey60') +
    geom_text(aes(label = region), nudge_y = .5) +
    scale_x_log10(limits = c(20, 1500)) +
    annotation_logticks(sides = 'b') +
    theme(legend.position = 'none') +
    labs(x = "Number of Companies", y = "Percent Women Founders")
```

### funding round

```{r, fig.height = 2.5, fig.width = 2.5}
# Founder gender summary per funding round code
df <- combined %>%
  group_by(funding_round_code) %>%
  summarize_gender()

# Bar chart with the rounded percentage above each bar and the round code
# printed just below the baseline
ggplot(df, aes(funding_round_code, pct_female)) +
  geom_bar(stat = 'identity', fill = 'grey60') +
  geom_text(
    aes(label = round(pct_female, 1)),
    nudge_y = .5, colour = 'grey20', size = 3
  ) +
  geom_text(
    aes(label = funding_round_code, y = 0),
    nudge_y = -.5, colour = 'grey20', size = 3
  ) +
  labs(x = "Funding Round", y = "Percent Women Founders") +
  theme_minimal() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
```

### year and month

```{r}
# Founder gender summary per funding year and month
df <- combined %>%
  mutate(
    year = year(funding_date),
    month = month(funding_date)
  ) %>%
  group_by(year, month) %>%
  summarize_gender
  
# One panel per year: monthly percent female founders, with a flat
# (intercept-only) fit line per panel. `limits` is spelled out in full rather
# than relying on partial argument matching.
df %>%
  ggplot(aes(month, pct_female, size = num_founders)) +
    geom_point(colour = 'grey30') +
    stat_smooth(se = FALSE, formula = y ~ 1, method = 'lm') +
    labs(x = "Funding Month", y = "Percent Women Founders") +
    facet_grid(. ~ year) +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank()) +
    theme(legend.position = 'bottom') +
    scale_y_continuous(limits = c(0, max(df$pct_female))) +
    scale_x_continuous(breaks = 1:12) +
    theme(axis.text.x = element_text(size = 6))
```

### Top and Bottom 5 VC name distributions

```{r}
# Names of the five investors with the highest and the lowest percent female
# founders among the top 100 VCs
top_invest <- df_100_vcs %>%
  arrange(desc(pct_female)) %>%
  slice(1:5) %>%
  pull(investor)
bot_invest <- df_100_vcs %>%
  arrange(pct_female) %>%
  slice(1:5) %>%
  pull(investor)

# Draw a wordcloud of founder first names for the companies backed by a set
# of investors, coloured by gender.
#
# Args:
#   combined: data frame with `investor`, `person_id`, `first_name` and
#     `gender` columns. (The parameter was previously misspelled `cobined`,
#     so the argument was ignored and the global `combined` was read instead.)
#   invest: investor names to keep.
#   max_n: name count that maps to a word scale of 1.5; passing the same
#     value to several calls keeps their word sizes comparable.
#
# Returns whatever wordcloud() returns; called for its plotting side effect.
plot_invest_wordcloud <- function(combined, invest, max_n) {

  df <- combined %>%
    filter(investor %in% invest) %>%
    select(person_id, first_name, gender) %>%
    unique %>% # dedupe over investors and rounds
    group_by(first_name, gender) %>%
    tally %>%
    mutate(color = ifelse(
      is.na(gender), 'grey',
      ifelse(gender == 'female', 'green', 'orange'))) %>%
    ungroup %>%
    arrange(desc(n)) %>%
    slice(1:500) %>%
    mutate(s = sqrt(n))
  
  # Sizes are sqrt(count), so normalise by sqrt(max_n) rather than max_n
  r <- range(df$s)
  r <- r / sqrt(max_n) * 1.5
  wordcloud(df$first_name, df$s, colors = df$color, 
            ordered.colors = TRUE, random.order = FALSE,
            scale = rev(r), min.freq = 0)
}
# Fix the RNG seed before drawing the two wordclouds
set.seed(1)
# Draw the two clouds side by side; par() returns the previous settings,
# which are restored at the end of the chunk
opar <- par(mfrow = c(1, 2))
plot_invest_wordcloud(combined, top_invest, 12)
title("Top 5 VCs", adj = 0.4)
plot_invest_wordcloud(combined, bot_invest, 15)
title("Bottom 5 VCs", adj = 0.4)
par(opar)
```