Startup-Success-Prediction

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.request import Request, urlopen
import json
import re
import glob
import os
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
from decimal import Decimal
import math
import seaborn as sns
from scipy.stats import chi2_contingency
from matplotlib.colors import ListedColormap
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, minmax_scale
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import r2_score, f1_score
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from IPython.display import Image, display  
from collections import Counter
import pydotplus 

warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

1.1 Scraping - Selenium Infinite Scrolling

html_companies_page = 'https://finder.startupnationcentral.org/startups/search'
def get_driver():
    driver = webdriver.Firefox(executable_path=r'C:\Python34\geckodriver.exe')
    driver.get(html_companies_page)
    return driver
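
Note: newer Selenium releases (4.x) deprecate the executable_path argument; a minimal sketch of the Service-based equivalent, assuming the same geckodriver path:

from selenium.webdriver.firefox.service import Service

def get_driver_v4():
    # Selenium 4 style: wrap the driver path in a Service object.
    service = Service(r'C:\Python34\geckodriver.exe')
    driver = webdriver.Firefox(service=service)
    driver.get(html_companies_page)
    return driver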

We couldn't get all pages with plain requests, so we used Selenium to implement infinite scrolling, letting us collect all company URLs and later scrape data from those pages.

def infinite_scroll(driver):
    time.sleep(2)  # Allow 2 seconds for the web page to open
    scroll_pause_time = 1
    screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the browser window
    i = 2
    while i < 50:
        # scroll one screen height each time, pausing so new results can load
        driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
        i += 1
        time.sleep(scroll_pause_time)
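
The fixed 48-iteration loop may stop before every company card has loaded. A sketch of a variant that instead scrolls until the document height stops growing (same driver calls, different stop condition; max_scrolls is a safety cap we added):

def infinite_scroll_until_stable(driver, scroll_pause_time=1, max_scrolls=200):
    # Scroll to the bottom repeatedly until the page height stops growing,
    # i.e., no more results are being lazy-loaded.
    last_height = driver.execute_script("return document.body.scrollHeight;")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:
            break  # height is stable: nothing new was loaded
        last_height = new_height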
driver = get_driver()
infinite_scroll(driver)
#linkedin login
driver = get_driver()
username = driver.find_element_by_id('username')
username.send_keys('your_email@example.com')  # your login email

time.sleep(0.5)

password = driver.find_element_by_id('password')
password.send_keys('your_password')  # your login password

time.sleep(0.5)

log_in_button = driver.find_element_by_class_name('btn__primary--large')
log_in_button.click()

time.sleep(3)

1.2 Scraping - Get Companies Urls

def load_soup_object(html_file_name):
    with open(html_file_name, encoding="utf8") as html:
        soup = BeautifulSoup(html, 'html.parser')
    return soup

We had to split our requests across multiple files because after some time of infinite scrolling we got blocked.

def extract_companies_urls(html_link):
    html_link = "./Data/companies_urls/" + html_link
    soup = load_soup_object(html_link)
    companie_div = soup("div",attrs = {"class" : 'js-company-cards-list'})[0]
    links = list()
    for row in companie_div('div',attrs = {"class" : 'box-view-item'}):
        url = row('a')[0]['href']
        links.append(url)
    return links
      

From each HTML page with multiple company URLs, we extracted the URLs into one big DataFrame.

image-2.png

list_50_to_90 = extract_companies_urls('Start-Up1950-1990.html')
list_91_to_00 = extract_companies_urls('Start-Up1991-2000.html')
list_01_to_05 = extract_companies_urls('Start-Up2001-2005.html')
list_05_to_09 = extract_companies_urls('Start-Up2005-2009.html')
list_10_to_11 = extract_companies_urls('Start-Up2010-2011.html')
list_12 = extract_companies_urls('Start-Up2012.html')
list_13 = extract_companies_urls('Start-Up2013.html')
list_14 = extract_companies_urls('Start-Up2014.html')
list_15 = extract_companies_urls('Start-Up2015.html')
list_16 = extract_companies_urls('Start-Up2016.html')
list_17 = extract_companies_urls('Start-Up2017.html')
list_18 = extract_companies_urls('Start-Up2018.html')
list_19 = extract_companies_urls('Start-Up2019.html')
list_20_22 = extract_companies_urls('Start-Up2020-2022.html')
link_list = (list_50_to_90 + list_91_to_00 + list_01_to_05 + list_05_to_09
             + list_10_to_11 + list_12 + list_13 + list_14 + list_15
             + list_16 + list_17 + list_18 + list_19 + list_20_22)
df = pd.DataFrame({'links' : link_list})
df = df.drop_duplicates().copy()
df = df.reset_index(drop=True)

We ended up with a CSV file with more than 10,300 URLs and saved it.

df.to_csv("companies_urls.csv",index = False)

1.3 Scraping - Company Page Scraping

def load_csv(fileName):
    return pd.read_csv(fileName)
def load_soup_html(url):    
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    return soup
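
Some sites throttle the default requests User-Agent, which may be part of why we got blocked. A hedged sketch of the same loader with a browser-like header (the UA string is illustrative):

def load_soup_html_with_headers(url):
    # Illustrative User-Agent; any common browser string works.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    html = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(html.text, 'html.parser')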

Let’s load our URLs CSV from the previous stage:

df = load_csv("companies_urls.csv")

Let’s define a function that extracts all of the features we want from the company page:

def get_df_companies(df):
    res_df = pd.DataFrame(columns=['COMPANIE_NAME', 'FOUNDED', 'BUSINESS_MODEL', 'EMPLOYEES','FUNDING_STAGE','RAISED'
                                   , 'PRODUCT_STAGE', 'NEWS_COUNT', 'FOUNDER_COUNT', 'TOTAL_ROUNDS', 'INVESTORS_COUNT'
                                   , 'COMPANY_STATUS', 'IS_ACADEMIC', 'SECTOR', 'TARGET_INDUSTORY'])
    for url in df['links']:
        print(url)
        soup = load_soup_html(url)
        
        company_dict = {
            "NAME": None,
            "FOUNDED": None,
            "BUSINESS MODEL": None,
            "EMPLOYEES": None,
            "FUNDING STAGE": None,
            "RAISED": 0,
            "PRODUCT STAGE": None,
            "NEWS": 0,
            "FOUNDERS": 0,
            "Total rounds": 0,
            "Investors": 0,
            "STATUS": "Active",
            "IS ACADEMIC": 0,
            "SECTOR": None,
            "TARGET INDUSTORY": None
        }
        
        name_div = soup("div",attrs = {"class" : 'top-profile-section'})
        if(not name_div):
            continue
            
        company_dict["NAME"] = soup("div",attrs = {"class" : 'top-profile-section'})[0]("div",attrs = {"class" : 'title'})[0].get_text().strip()

        COMPANY_PROFILE = soup("div",attrs = {"class" : 'metadata-wrapper'})
        if(len(COMPANY_PROFILE)):
            for div in COMPANY_PROFILE[0]("div",attrs = {"class" : 'item-bottom'}):
                company_dict.update( {div.get_text().strip(): div.find_previous('div').get_text().strip()})
            
        if(soup("div", attrs = {"id": "news-section"})):
            news_div = soup("div", attrs = {"id": "news-section"})[0]
            news_str = news_div("div", attrs = {"class": "section-text"})[0].get_text().strip()
            company_dict["NEWS"] = int(re.findall('\d+', news_str)[0])
            
        if(soup("div", attrs = {"id": "team-section"})):
            team_div = soup("div", attrs = {"id": "team-section"})[0]
            team_str = team_div("div", attrs = {"class": "section-text"})[0].get_text().strip()
            company_dict["FOUNDERS"] = int(re.findall('\d+', team_str)[0])
         
        FUNDING_DATA = soup("div",attrs = {"class" : 'funding-metadata'})
        if(len(FUNDING_DATA)):
            for div in FUNDING_DATA[0]("div",attrs = {"class" : 'subtitle'}):
                funding_text = div.get_text().strip()
                if(funding_text == 'Total rounds' or funding_text == 'Investors'):
                    company_dict[funding_text] = div.find_previous('div').get_text().strip()

        if(soup.find_all("div", string=["Public"])):
            company_dict["STATUS"] = "Public"
        elif(soup.find_all("div", string=["Acquired by"])):
            company_dict["STATUS"] = "Acquired"
        elif(soup.find_all("div", string=["Not Active"])):
            company_dict["STATUS"] = "Not Active"
            
        if(soup.find_all("div", string=["Academic technology"])):
            company_dict["IS ACADEMIC"] = 1
            
        sector_div = soup.find_all("use", attrs={"xlink:href": "#icon-classification-sector"})
        if(sector_div):
            sector_div = sector_div[0].parent.parent.parent.parent("a", attrs = {"class": "lead-title"})
            company_dict["SECTOR"] = sector_div[0].get_text().strip()
        
        industry_div = soup.find_all("use", attrs={"xlink:href": "#icon-classification-industry"})
        if(industry_div):
            industry_div = industry_div[0].parent.parent.parent.parent("a", attrs = {"class": "lead-title"})
            company_dict["TARGET INDUSTORY"] = industry_div[0].get_text().strip()
        
        print(company_dict)
        res_df.loc[len(res_df)] = list(company_dict.values())
        
    return res_df

Main company page with some interesting data!

image-3.png

COMPANY PROFILE data and HTML locators

image.png

COMPANY FOUNDERS and Funding rounds timeline

image-4.png

Now we are going to run a loop; each iteration extracts data for 100 companies, and then we wait a few seconds so we won't get blocked.

Each iteration is saved in a separate DataFrame so we can stay in control the whole time and check the data we are getting (this step lasted about 48 hours).

for i in range(0, 104):
    df_res = get_df_companies(df.iloc[i*100: (i+1) * 100])
    df_res.to_csv("Data/companies_df/companies_" + str(i*100) + "-" + str((i*100) + 99) + ".csv", index=False)
    print("************************ file number:" + str(i) + "***********************\n\n")
    time.sleep(10)

Now that we have finished scraping all of the data, let's combine all the files into one big DataFrame:

path = r'Data\companies_df'
all_files = sorted(glob.glob(os.path.join(path, "*.csv")), key=os.path.getmtime) #get all files sorted by time modified asc 

df_from_each_file = (pd.read_csv(f) for f in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
# concatenated_df
filepath = r'Data\companies_df\main_df.csv'
concatenated_df.to_csv(filepath, index=False)

2. Data cleaning 🧹🧹🧹

After the scraping process we need to start the cleaning process.
main_df = pd.read_csv('Data/companies_df/main_df.csv')
main_df
COMPANIE_NAME FOUNDED BUSINESS_MODEL EMPLOYEES FUNDING_STAGE RAISED PRODUCT_STAGE NEWS_COUNT FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT COMPANY_STATUS IS_ACADEMIC SECTOR TARGET_INDUSTORY
0 Golan Plastic Products 1/1964 B2B, B2G 11-50 Public 0 Released 1 4 0 0 Public 0 Industrial Technologies Energy, Utilities & Waste Management
1 Cham Foods 12/1970 B2B, B2B2C 11-50 Public 0 Released 2 2 0 0 Public 0 AgriFood-tech & Water Agriculture & Food
2 HerbaMed 1/1986 B2B, B2C 1-10 Revenue Financed 0 Released 2 2 0 0 Not Active 0 AgriFood-tech & Water Agriculture & Food
3 RAD 1/1981 B2B, B2G 500+ Established 0 Released 13 3 0 0 Active 0 Industrial Technologies Communication Services
4 RADA 12/1970 B2B, B2G 201-500 Public $35.3M Released 31 6 2 5 Public 0 Security Technologies Defense, Safety & Security
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10373 Expecting 1/2021 B2B, B2C 11-50 Seed $1M Released 2 2 1 0 Active 0 Life Sciences & HealthTech Consumers
10374 Loona 8/2020 B2B 1-10 Bootstrapped 0 Customer development 0 2 0 0 Active 0 Enterprise, IT & Data Infrastructure Industrial Manufacturing
10375 Quiz Beez 6/2021 B2C 1-10 Bootstrapped 0 Customer development 0 1 0 0 Active 0 Content & Media Education
10376 Eureka Security 10/2021 B2B 11-50 Seed $8M Released 2 2 1 7 Active 0 Security Technologies Enterprise & Professional Services
10377 Kitchezz 11/2020 B2B, B2C 1-10 Bootstrapped 0 Released 0 2 0 0 Active 0 Retail & Marketing Commerce & Retail

10378 rows × 15 columns

First, let's look at missing values for our companies.
We can see that we have up to 709 missing values in some columns, so we will start our data cleaning by using dropna.
main_df.isnull().sum(axis = 0)
COMPANIE_NAME         0
FOUNDED             376
BUSINESS_MODEL      467
EMPLOYEES           408
FUNDING_STAGE       709
RAISED                0
PRODUCT_STAGE       569
NEWS_COUNT            0
FOUNDER_COUNT         0
TOTAL_ROUNDS          0
INVESTORS_COUNT       0
COMPANY_STATUS        0
IS_ACADEMIC           0
SECTOR              343
TARGET_INDUSTORY    394
dtype: int64
main_df.dropna(inplace=True)
print(main_df.isnull().sum(axis = 0))
print(len(main_df))
COMPANIE_NAME       0
FOUNDED             0
BUSINESS_MODEL      0
EMPLOYEES           0
FUNDING_STAGE       0
RAISED              0
PRODUCT_STAGE       0
NEWS_COUNT          0
FOUNDER_COUNT       0
TOTAL_ROUNDS        0
INVESTORS_COUNT     0
COMPANY_STATUS      0
IS_ACADEMIC         0
SECTOR              0
TARGET_INDUSTORY    0
dtype: int64
9210
Second, let's see if we can drop duplicate rows.
After removing NaN values we have a total of 9210 rows.
main_df.drop_duplicates()
COMPANIE_NAME FOUNDED BUSINESS_MODEL EMPLOYEES FUNDING_STAGE RAISED PRODUCT_STAGE NEWS_COUNT FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT COMPANY_STATUS IS_ACADEMIC SECTOR TARGET_INDUSTORY
0 Golan Plastic Products 1/1964 B2B, B2G 11-50 Public 0 Released 1 4 0 0 Public 0 Industrial Technologies Energy, Utilities & Waste Management
1 Cham Foods 12/1970 B2B, B2B2C 11-50 Public 0 Released 2 2 0 0 Public 0 AgriFood-tech & Water Agriculture & Food
2 HerbaMed 1/1986 B2B, B2C 1-10 Revenue Financed 0 Released 2 2 0 0 Not Active 0 AgriFood-tech & Water Agriculture & Food
3 RAD 1/1981 B2B, B2G 500+ Established 0 Released 13 3 0 0 Active 0 Industrial Technologies Communication Services
4 RADA 12/1970 B2B, B2G 201-500 Public $35.3M Released 31 6 2 5 Public 0 Security Technologies Defense, Safety & Security
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10373 Expecting 1/2021 B2B, B2C 11-50 Seed $1M Released 2 2 1 0 Active 0 Life Sciences & HealthTech Consumers
10374 Loona 8/2020 B2B 1-10 Bootstrapped 0 Customer development 0 2 0 0 Active 0 Enterprise, IT & Data Infrastructure Industrial Manufacturing
10375 Quiz Beez 6/2021 B2C 1-10 Bootstrapped 0 Customer development 0 1 0 0 Active 0 Content & Media Education
10376 Eureka Security 10/2021 B2B 11-50 Seed $8M Released 2 2 1 7 Active 0 Security Technologies Enterprise & Professional Services
10377 Kitchezz 11/2020 B2B, B2C 1-10 Bootstrapped 0 Released 0 2 0 0 Active 0 Retail & Marketing Commerce & Retail

9210 rows × 15 columns

We can see that after the drop_duplicates call we still have 9210 rows, which means our scraping produced no duplicate records!

Now, because our 'RAISED' feature is not numeric, we need to transform the 'RAISED' value from string to int, taking into account the meaning of the magnitude suffixes ('K', 'M', 'B').
d = {'K': 3, 'M': 6, 'B': 9}
def text_to_num(text):
    # Convert strings like '$35.3M' to integers; plain '0' stays 0.
    if text == '0':
        return 0
    if text[-1] in d:
        num, magnitude = text[1:-1], text[-1]  # strip the '$' prefix and the suffix letter
        return int(float(num) * 10 ** d[magnitude])
    else:
        return 0
for idx, row in main_df.iterrows():
    raised = text_to_num(row['RAISED'])
    main_df.at[idx,'RAISED'] = raised

main_df["RAISED"] = pd.to_numeric(main_df["RAISED"])
main_df["RAISED"].describe().apply(lambda x: format(x, 'f'))
count          9210.000000
mean        9480790.375136
std        49695496.790866
min               0.000000
25%               0.000000
50%               0.000000
75%         1199000.000000
max      1247000000.000000
Name: RAISED, dtype: object
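A quick sanity check of the suffix conversion, using values visible in the table above:

# Spot-check text_to_num against values seen in the scraped data.
assert text_to_num('$35.3M') == 35_300_000   # RADA's raised amount
assert text_to_num('$1M') == 1_000_000
assert text_to_num('0') == 0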
Now, let's extract the month and year from the 'FOUNDED' feature, which looks like "10/2021".
We need to separate the month and year from each string and push them into our DataFrame.
def extract_month_and_year(val):
    lst = val.split('/')
    if len(lst) == 1:
        lst.insert(0,0)
    return int(lst[0]),int(lst[1])

months = list()
years = list()
for idx, row in main_df.iterrows():
    month,year = extract_month_and_year(row['FOUNDED'])
    months.append(month)
    years.append(year)
main_df.insert(loc=2, column='FOUNDED_MONTH', value=months) 
main_df.insert(loc=3, column='FOUNDED_YEAR', value=years) 
main_df.drop(columns = ['FOUNDED'], axis=1, inplace=True)
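The same split can be done without iterrows; a vectorized sketch using pandas string methods (it assumes 'FOUNDED' is still present and every value is 'M/YYYY' or 'YYYY'):

# Vectorized alternative to the loop above.
parts = main_df['FOUNDED'].str.split('/', expand=True)
has_month = parts[1].notna()
main_df['FOUNDED_MONTH'] = np.where(has_month, parts[0], 0).astype(int)
main_df['FOUNDED_YEAR'] = np.where(has_month, parts[1], parts[0]).astype(int)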
Let's have a look at our numeric features:
main_df.describe()
FOUNDED_MONTH FOUNDED_YEAR RAISED NEWS_COUNT FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT IS_ACADEMIC
count 9210.000000 9210.000000 9.210000e+03 9210.000000 9210.000000 9210.000000 9210.000000 9210.0
mean 4.998697 2011.541151 9.480790e+06 2.831813 2.618675 0.930293 1.358958 0.0
std 3.775316 9.305594 4.969550e+07 5.668288 1.685656 1.505888 2.841590 0.0
min 0.000000 1950.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.0
25% 1.000000 2010.000000 0.000000e+00 0.000000 1.250000 0.000000 0.000000 0.0
50% 5.000000 2014.000000 0.000000e+00 1.000000 2.000000 0.000000 0.000000 0.0
75% 8.000000 2017.000000 1.199000e+06 3.000000 3.000000 1.000000 1.000000 0.0
max 12.000000 2022.000000 1.247000e+09 98.000000 41.000000 16.000000 32.000000 0.0
Now we have the categorical feature 'BUSINESS_MODEL' that we want to transform into numeric true/false columns.
The business model feature has 4 possible categories, and a company can be characterized by several different models at once.
def extract_business_model(main_df):
    b2b = list()
    b2c = list()
    b2g = list()
    b2b2c = list()
    for model in main_df['BUSINESS_MODEL']:
        lst = model.split(', ')
        b2b.append('B2B' in lst)
        b2c.append('B2C' in lst)
        b2g.append('B2G' in lst)
        b2b2c.append('B2B2C' in lst)
            
    return b2b, b2c, b2g ,b2b2c
    
    
b2b, b2c, b2g ,b2b2c = extract_business_model(main_df)
main_df.insert(loc=4, column='B2B', value=b2b) 
main_df.insert(loc=5, column='B2C', value=b2c) 
main_df.insert(loc=6, column='B2G', value=b2g) 
main_df.insert(loc=7, column='B2B2C', value=b2b2c) 
main_df['B2B'] = np.where(main_df['B2B'],1,0)
main_df['B2C'] = np.where(main_df['B2C'],1,0)
main_df['B2G'] = np.where(main_df['B2G'],1,0)
main_df['B2B2C'] = np.where(main_df['B2B2C'],1,0)
main_df.drop(columns = ['BUSINESS_MODEL'], axis=1, inplace=True)
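For reference, pandas can produce the same one-hot columns in a single call; a sketch (run before dropping 'BUSINESS_MODEL'):

# Built-in dummy encoding of the comma-separated business models;
# yields 0/1 columns named 'B2B', 'B2B2C', 'B2C', 'B2G'.
dummies = main_df['BUSINESS_MODEL'].str.get_dummies(sep=', ')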
'COMPANY_STATUS' is also a categorical feature, and we want to transform it into numeric true/false columns:
def extract_company_status(main_df):
    is_public = list()
    is_acquired = list()
    is_active = list()
    is_notActive = list()
    for status in main_df['COMPANY_STATUS']:
        is_public.append('Public' == status)
        is_acquired.append('Acquired' == status)
        is_active.append('Active' == status)
        is_notActive.append('Not Active' == status)
    
    is_public = np.where(is_public,1,0)
    is_acquired = np.where(is_acquired,1,0)
    is_active = np.where(is_active,1,0)
    is_notActive = np.where(is_notActive,1,0)
    
    return is_public, is_acquired, is_active ,is_notActive
is_public, is_acquired, is_active ,is_notActive = extract_company_status(main_df)
main_df.insert(loc=16, column='IS_PUBLIC', value=is_public) 
main_df.insert(loc=17, column='IS_ACQUIRED', value=is_acquired) 
main_df.insert(loc=18, column='IS_ACTIVE', value=is_active) 
main_df.insert(loc=19, column='IS_NOT_ACTIVE', value=is_notActive)
main_df.drop(columns = ['COMPANY_STATUS'], axis=1, inplace=True)

Let’s see what we got so far:

main_df
COMPANIE_NAME FOUNDED_MONTH FOUNDED_YEAR B2B B2C B2G B2B2C EMPLOYEES FUNDING_STAGE RAISED ... FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT IS_PUBLIC IS_ACQUIRED IS_ACTIVE IS_NOT_ACTIVE IS_ACADEMIC SECTOR TARGET_INDUSTORY
0 Golan Plastic Products 1 1964 1 0 1 0 11-50 Public 0 ... 4 0 0 1 0 0 0 0 Industrial Technologies Energy, Utilities & Waste Management
1 Cham Foods 12 1970 1 0 0 1 11-50 Public 0 ... 2 0 0 1 0 0 0 0 AgriFood-tech & Water Agriculture & Food
2 HerbaMed 1 1986 1 1 0 0 1-10 Revenue Financed 0 ... 2 0 0 0 0 0 1 0 AgriFood-tech & Water Agriculture & Food
3 RAD 1 1981 1 0 1 0 500+ Established 0 ... 3 0 0 0 0 1 0 0 Industrial Technologies Communication Services
4 RADA 12 1970 1 0 1 0 201-500 Public 35300000 ... 6 2 5 1 0 0 0 0 Security Technologies Defense, Safety & Security
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10373 Expecting 1 2021 1 1 0 0 11-50 Seed 1000000 ... 2 1 0 0 0 1 0 0 Life Sciences & HealthTech Consumers
10374 Loona 8 2020 1 0 0 0 1-10 Bootstrapped 0 ... 2 0 0 0 0 1 0 0 Enterprise, IT & Data Infrastructure Industrial Manufacturing
10375 Quiz Beez 6 2021 0 1 0 0 1-10 Bootstrapped 0 ... 1 0 0 0 0 1 0 0 Content & Media Education
10376 Eureka Security 10 2021 1 0 0 0 11-50 Seed 8000000 ... 2 1 7 0 0 1 0 0 Security Technologies Enterprise & Professional Services
10377 Kitchezz 11 2020 1 1 0 0 1-10 Bootstrapped 0 ... 2 0 0 0 0 1 0 0 Retail & Marketing Commerce & Retail

9210 rows × 22 columns

main_df.describe(include='all')
COMPANIE_NAME FOUNDED_MONTH FOUNDED_YEAR B2B B2C B2G B2B2C EMPLOYEES FUNDING_STAGE RAISED ... FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT IS_PUBLIC IS_ACQUIRED IS_ACTIVE IS_NOT_ACTIVE IS_ACADEMIC SECTOR TARGET_INDUSTORY
count 9210 9210.000000 9210.000000 9210.000000 9210.000000 9210.000000 9210.000000 9210 9210 9.210000e+03 ... 9210.000000 9210.000000 9210.000000 9210.000000 9210.000000 9210.000000 9210.000000 9210.0 9210 9210
unique 9199 NaN NaN NaN NaN NaN NaN 5 10 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 11 17
top Amigo NaN NaN NaN NaN NaN NaN 1-10 Bootstrapped NaN ... NaN NaN NaN NaN NaN NaN NaN NaN Life Sciences & HealthTech Consumers
freq 2 NaN NaN NaN NaN NaN NaN 5694 2655 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 1660 2640
mean NaN 4.998697 2011.541151 0.790662 0.338436 0.154397 0.145168 NaN NaN 9.480790e+06 ... 2.618675 0.930293 1.358958 0.035722 0.095331 0.585125 0.283822 0.0 NaN NaN
std NaN 3.775316 9.305594 0.406858 0.473203 0.361349 0.352290 NaN NaN 4.969550e+07 ... 1.685656 1.505888 2.841590 0.185606 0.293688 0.492727 0.450876 0.0 NaN NaN
min NaN 0.000000 1950.000000 0.000000 0.000000 0.000000 0.000000 NaN NaN 0.000000e+00 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 NaN NaN
25% NaN 1.000000 2010.000000 1.000000 0.000000 0.000000 0.000000 NaN NaN 0.000000e+00 ... 1.250000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 NaN NaN
50% NaN 5.000000 2014.000000 1.000000 0.000000 0.000000 0.000000 NaN NaN 0.000000e+00 ... 2.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.0 NaN NaN
75% NaN 8.000000 2017.000000 1.000000 1.000000 0.000000 0.000000 NaN NaN 1.199000e+06 ... 3.000000 1.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.0 NaN NaN
max NaN 12.000000 2022.000000 1.000000 1.000000 1.000000 1.000000 NaN NaN 1.247000e+09 ... 41.000000 16.000000 32.000000 1.000000 1.000000 1.000000 1.000000 0.0 NaN NaN

11 rows × 22 columns

We realized that 'IS_ACADEMIC' value is 0 for all the rows at this point, so we decided to drop the entire column.
print("Number of companies with IS_ACADEMIC = True :  ",sum(main_df['IS_ACADEMIC']==1))
main_df.drop(columns = ['IS_ACADEMIC'], axis=1, inplace=True)
Number of companies with IS_ACADEMIC = True :   0
'FUNDING_STAGE', 'EMPLOYEES' and 'PRODUCT_STAGE' are categorical features whose values we want to convert to numeric categories.
Let's look at our unique values:
print(main_df['FUNDING_STAGE'].unique())
print(main_df['EMPLOYEES'].unique())
print(main_df['PRODUCT_STAGE'].unique())
['Public' 'Revenue Financed' 'Established' 'ROUND A' 'ROUND C+' 'Acquired'
 'Seed' 'ROUND B' 'Bootstrapped' 'Pre-Seed']
['11-50' '1-10' '500+' '201-500' '51-200']
['Released' 'Clinical Trial' 'R&D' 'Beta' 'Alpha' 'Customer development']
Acquired is not a funding stage, therefore we need to drop the 8 records where funding stage = Acquired.
len(main_df[main_df['FUNDING_STAGE'] == 'Acquired'].index)
8
main_df.drop(main_df[main_df.FUNDING_STAGE == 'Acquired'].index, inplace=True)
Because these features are ordinal categoricals, we map them to numeric codes in their natural order:
employees_replace_map = {'1-10': 0, '11-50': 1, '51-200': 2, '201-500': 3, '500+': 4}
main_df['EMPLOYEES'].replace(employees_replace_map, inplace=True)

funding_replace_map = {'Bootstrapped': 0, 'Pre-Seed': 1, 'Seed': 2, 'ROUND A': 3, 'ROUND B': 4, 'ROUND C+': 5, 'Public': 6, 'Revenue Financed': 7, 'Established': 8}
main_df['FUNDING_STAGE'].replace(funding_replace_map, inplace=True)

stage_replace_map = {'Customer development': 0, 'R&D': 1, 'Clinical Trial': 2, 'Alpha': 3, 'Beta': 4, 'Released': 5}
main_df['PRODUCT_STAGE'].replace(stage_replace_map, inplace=True)
'SECTOR' and 'TARGET_INDUSTORY' are not ordinal categoricals, so we can give them codes in any order.
sector_replace_map = dict( enumerate(main_df['SECTOR'].astype('category').cat.categories ))
main_df['SECTOR'].replace(sector_replace_map, inplace=True)
print(sector_replace_map)
target_replace_map = dict( enumerate(main_df['TARGET_INDUSTORY'].astype('category').cat.categories ))
main_df['TARGET_INDUSTORY'].replace(target_replace_map, inplace=True)
print(target_replace_map)

{0: 'Aerospace & Aviation', 1: 'AgriFood-tech & Water', 2: 'Content & Media', 3: 'Energy-tech', 4: 'Enterprise, IT & Data Infrastructure', 5: 'FinTech', 6: 'Industrial Technologies', 7: 'Life Sciences & HealthTech', 8: 'Retail & Marketing', 9: 'Security Technologies', 10: 'Smart Mobility'}
{0: 'Agriculture & Food', 1: 'Commerce & Retail', 2: 'Communication Services', 3: 'Consumers', 4: 'Defense, Safety & Security', 5: 'Education', 6: 'Energy, Utilities & Waste Management', 7: 'Enterprise & Professional Services', 8: 'Financial Services', 9: 'Food Retail & Consumption', 10: 'Government & City', 11: 'Healthcare & Life Sciences', 12: 'Industrial Manufacturing', 13: 'Media & Entertainment', 14: 'Real Estate & Construction', 15: 'Transportation & Logistics', 16: 'Travel & Tourism'}
def get_age(year_list, month_list):
    # Approximate company age in days as of June 2022
    # (365 days per year, 30 days per month).
    age = list()
    for year, month in zip(year_list, month_list):
        age.append((2022 - year) * 365 + (6 - month) * 30)
    return age
We see that some months have a value of 0, so we will replace them with the mean month.
main_df.loc[main_df['FOUNDED_MONTH'] == 0, ['FOUNDED_MONTH']] = round(main_df['FOUNDED_MONTH'].mean())
And now let's add the 'AGE' feature, which is how old the company is in days.
age = get_age(main_df['FOUNDED_YEAR'], main_df['FOUNDED_MONTH'])
main_df.insert(loc=3, column='AGE', value=age) 
main_df
COMPANIE_NAME FOUNDED_MONTH FOUNDED_YEAR AGE B2B B2C B2G B2B2C EMPLOYEES FUNDING_STAGE ... NEWS_COUNT FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT IS_PUBLIC IS_ACQUIRED IS_ACTIVE IS_NOT_ACTIVE SECTOR TARGET_INDUSTORY
0 Golan Plastic Products 1 1964 21320 1 0 1 0 1 6 ... 1 4 0 0 1 0 0 0 Industrial Technologies Energy, Utilities & Waste Management
1 Cham Foods 12 1970 18800 1 0 0 1 1 6 ... 2 2 0 0 1 0 0 0 AgriFood-tech & Water Agriculture & Food
2 HerbaMed 1 1986 13290 1 1 0 0 0 7 ... 2 2 0 0 0 0 0 1 AgriFood-tech & Water Agriculture & Food
3 RAD 1 1981 15115 1 0 1 0 4 8 ... 13 3 0 0 0 0 1 0 Industrial Technologies Communication Services
4 RADA 12 1970 18800 1 0 1 0 3 6 ... 31 6 2 5 1 0 0 0 Security Technologies Defense, Safety & Security
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10373 Expecting 1 2021 515 1 1 0 0 1 2 ... 2 2 1 0 0 0 1 0 Life Sciences & HealthTech Consumers
10374 Loona 8 2020 670 1 0 0 0 0 0 ... 0 2 0 0 0 0 1 0 Enterprise, IT & Data Infrastructure Industrial Manufacturing
10375 Quiz Beez 6 2021 365 0 1 0 0 0 0 ... 0 1 0 0 0 0 1 0 Content & Media Education
10376 Eureka Security 10 2021 245 1 0 0 0 1 2 ... 2 2 1 7 0 0 1 0 Security Technologies Enterprise & Professional Services
10377 Kitchezz 11 2020 580 1 1 0 0 0 0 ... 0 2 0 0 0 0 1 0 Retail & Marketing Commerce & Retail

9202 rows × 22 columns

3. Outliers 🤥⛔

In this section, we are going to look for outliers in our numeric features, starting with boxplots:

sns.boxplot(main_df.RAISED)
plt.show()
sns.boxplot(main_df.NEWS_COUNT)
plt.show()
sns.boxplot(main_df.FOUNDER_COUNT)
plt.show()
sns.boxplot(main_df.INVESTORS_COUNT)
plt.show()

png

png

png

png

‘RAISED’

Q1 = np.percentile(main_df["RAISED"], 25)
Q3 = np.percentile(main_df["RAISED"], 75)
IQR = Q3 - Q1
print(f"IQR value: {IQR}\nQ1 value: {Q1}\nQ3 value: {Q3}")
raised_outlier_rows = main_df[(main_df["RAISED"] < Q1 - 1.5*IQR) | (main_df["RAISED"] > Q3 + 1.5*IQR)].index
print("Total sum of outliers detected: ", len(raised_outlier_rows))
IQR value: 1200000.0
Q1 value: 0.0
Q3 value: 1200000.0
Total sum of outliers detected:  1763

With IQR we got 1763 outliers, which is too many; let's try the z-score instead:

z_score = (main_df["RAISED"] - main_df["RAISED"].mean()) / main_df["RAISED"].std()
outliers = abs(z_score) > 3
print("Total outliers: ", sum(outliers))
main_df.drop(main_df[outliers].index, axis=0, inplace=True)
print("110 outliers are much better for us so we will use z-score")
Total outliers:  110
110 outliers are much better for us so we will use z-score

‘NEWS_COUNT’

z_score = (main_df["NEWS_COUNT"] - main_df["NEWS_COUNT"].mean()) / main_df["NEWS_COUNT"].std()
outliers = abs(z_score) > 4
print("Total outliers: ", sum(outliers))
main_df.drop(main_df[outliers].index, axis=0, inplace=True)
Total outliers:  111

‘FOUNDER_COUNT’

z_score = (main_df["FOUNDER_COUNT"] - main_df["FOUNDER_COUNT"].mean()) / main_df["FOUNDER_COUNT"].std()
outliers = abs(z_score) > 4
print("Total outliers: ", sum(outliers))
main_df.drop(main_df[outliers].index, axis=0, inplace=True)
Total outliers:  54

‘INVESTORS_COUNT’

z_score = (main_df["INVESTORS_COUNT"] - main_df["INVESTORS_COUNT"].mean()) / main_df["INVESTORS_COUNT"].std()
outliers = abs(z_score) > 4
print("Total outliers: ", sum(outliers))
main_df.drop(main_df[outliers].index, axis=0, inplace=True)
Total outliers:  106
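
Since the same z-score filter was applied to four columns, it could be folded into a small helper; a sketch with identical logic:

def drop_zscore_outliers(df, column, threshold=4):
    # Drop rows whose value in `column` lies more than `threshold`
    # standard deviations from the column mean.
    z_score = (df[column] - df[column].mean()) / df[column].std()
    outliers = abs(z_score) > threshold
    print(f"{column}: dropping {outliers.sum()} outliers")
    df.drop(df[outliers].index, axis=0, inplace=True)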

Let’s look at our features after cleaning outliers:

sns.boxplot(main_df.RAISED)
plt.show()
sns.boxplot(main_df.NEWS_COUNT)
plt.show()
sns.boxplot(main_df.FOUNDER_COUNT)
plt.show()
sns.boxplot(main_df.INVESTORS_COUNT)
plt.show()

png

png

png

png

We can see that the outlier boxplots for our numeric columns look much better now!
Data cleaning is finished; let's fix our indexes and save the final DataFrame.
main_df.reset_index(inplace=True)
main_df
index COMPANIE_NAME FOUNDED_MONTH FOUNDED_YEAR AGE B2B B2C B2G B2B2C EMPLOYEES ... NEWS_COUNT FOUNDER_COUNT TOTAL_ROUNDS INVESTORS_COUNT IS_PUBLIC IS_ACQUIRED IS_ACTIVE IS_NOT_ACTIVE SECTOR TARGET_INDUSTORY
0 0 Golan Plastic Products 1 1964 21320 1 0 1 0 1 ... 1 4 0 0 1 0 0 0 Industrial Technologies Energy, Utilities & Waste Management
1 1 Cham Foods 12 1970 18800 1 0 0 1 1 ... 2 2 0 0 1 0 0 0 AgriFood-tech & Water Agriculture & Food
2 2 HerbaMed 1 1986 13290 1 1 0 0 0 ... 2 2 0 0 0 0 0 1 AgriFood-tech & Water Agriculture & Food
3 3 RAD 1 1981 15115 1 0 1 0 4 ... 13 3 0 0 0 0 1 0 Industrial Technologies Communication Services
4 5 Ham-Let 5 1950 26310 1 0 0 0 3 ... 7 4 0 0 0 1 0 0 Industrial Technologies Energy, Utilities & Waste Management
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8816 10373 Expecting 1 2021 515 1 1 0 0 1 ... 2 2 1 0 0 0 1 0 Life Sciences & HealthTech Consumers
8817 10374 Loona 8 2020 670 1 0 0 0 0 ... 0 2 0 0 0 0 1 0 Enterprise, IT & Data Infrastructure Industrial Manufacturing
8818 10375 Quiz Beez 6 2021 365 0 1 0 0 0 ... 0 1 0 0 0 0 1 0 Content & Media Education
8819 10376 Eureka Security 10 2021 245 1 0 0 0 1 ... 2 2 1 7 0 0 1 0 Security Technologies Enterprise & Professional Services
8820 10377 Kitchezz 11 2020 580 1 1 0 0 0 ... 0 2 0 0 0 0 1 0 Retail & Marketing Commerce & Retail

8821 rows × 23 columns

main_df.to_csv('Data/companies_df/clean_df.csv', index=False)

4. EDA 📊📈📉

df = pd.read_csv('Data/companies_df/clean_df.csv')
df_raised_money = df[df['RAISED'] > 0]
df_didnt_raised_money = df[df['RAISED'] == 0]
print("Total number of companies after cleaning the data: ", len(df))
print("Number of companies who raised money: ",len(df_raised_money))
print("Number of companies who raised money: ",len(df_didnt_raised_money))
Total number of companies after cleaning the data:  8821
Number of companies who raised money:  2905
Number of companies who raised money:  5916

Let’s look at the status distribution of companies, split by whether they raised money

fig, axs = plt.subplots(1,3)
fig.subplots_adjust(0.3,0,3,2)

labels = ['IS_PUBLIC', 'IS_ACTIVE', 'IS_ACQUIRED', 'IS_NOT_ACTIVE']

sizes = [len(df[df['IS_PUBLIC'] == 1]), len(df[df['IS_ACTIVE'] == 1]),len(df[df['IS_ACQUIRED'] == 1]), len(df[df['IS_NOT_ACTIVE'] == 1])]
axs[0].pie(sizes, labels=labels,autopct='%1.1f%%',  shadow=True)
axs[0].set_title("All companies", bbox={'facecolor':'0.8', 'pad':5})

sizes_raised = [len(df_raised_money[df_raised_money['IS_PUBLIC'] == 1]), len(df_raised_money[df_raised_money['IS_ACTIVE'] == 1]),len(df_raised_money[df_raised_money['IS_ACQUIRED'] == 1]), len(df_raised_money[df_raised_money['IS_NOT_ACTIVE'] == 1])]
axs[1].pie(sizes_raised, labels=labels,autopct='%1.1f%%',  shadow=True)
axs[1].set_title("Companies with Raised value", bbox={'facecolor':'0.8', 'pad':5})

sizes_didnt_raised = [len(df_didnt_raised_money[df_didnt_raised_money['IS_PUBLIC'] == 1]), len(df_didnt_raised_money[df_didnt_raised_money['IS_ACTIVE'] == 1]),len(df_didnt_raised_money[df_didnt_raised_money['IS_ACQUIRED'] == 1]), len(df_didnt_raised_money[df_didnt_raised_money['IS_NOT_ACTIVE'] == 1])]
axs[2].pie(sizes_didnt_raised, labels=labels,autopct='%1.1f%%',  shadow=True)
axs[2].set_title("Companies without Raised value", bbox={'facecolor':'0.8', 'pad':5})

plt.show()

png

As we can see, companies that raised money are more likely to be acquired or public.

Let’s define what a successful company is:

- If the company is acquired or public, we will consider it a successful company.

df_successful_companies = df[(df.IS_ACQUIRED == 1) | (df.IS_PUBLIC == 1)]
df_unsuccessful_companies = df[(df.IS_ACQUIRED == 0) & (df.IS_PUBLIC == 0)]
print("The number of successful companies is: ", len(df_successful_companies))
print("The number of unsuccessful companies is: ", len(df_unsuccessful_companies))
The number of successful companies is:  1073
The number of unsuccessful companies is:  7748

After we defined what a successful company is, we need to convert our 4 status columns into an ‘IS_SUCCESSFUL’ column.

def extract_is_successful(df):
    is_successful = list()

    for index, row in df.iterrows():
        # a company counts as successful if it is public or was acquired
        if(row['IS_PUBLIC'] | row['IS_ACQUIRED']):
            is_successful.append(1)
        else:
            is_successful.append(0)
    
    return is_successful
is_successful = extract_is_successful(df)
df.insert(loc=16, column='IS_SUCCESSFUL', value=is_successful)
df.drop(columns = ['IS_PUBLIC'], axis=1, inplace=True)
df.drop(columns = ['IS_ACQUIRED'], axis=1, inplace=True)
df.drop(columns = ['IS_ACTIVE'], axis=1, inplace=True)
df.drop(columns = ['IS_NOT_ACTIVE'], axis=1, inplace=True)
df.drop(columns = ['COMPANIE_NAME'], axis=1, inplace=True)

We will now examine whether the number of employees affects the company’s success.

ct = pd.crosstab(df['EMPLOYEES'], (df['IS_SUCCESSFUL']), normalize="index")

ax = ct.plot(kind="bar", figsize=(18,6), label=["1-10","11-50","51-200","201-500","500+"])
ax.legend(["Unsuccessful", "Successful"],fancybox=True, framealpha=1, shadow=True, borderpad=1)
for container in ax.containers:
    ax.bar_label(container)

plt.title("Number of employees in successful companies", fontsize = 20)
plt.ylabel("Relative frequency")
Text(0, 0.5, 'Relative frequency')

png

It can certainly be seen that the larger the number of employees, the greater the chances of success.

We checked whether there is a relationship between the age of a company and the amount of money it raised

x = df_raised_money.AGE
y = df_raised_money.RAISED
A = np.vstack([x, np.ones(len(x))]).T  # design matrix [x, 1]

y = y.values[:, np.newaxis]
alpha = np.dot((np.dot(np.linalg.inv(np.dot(A.T, A)), A.T)), y)  # OLS fit via the normal equation

plt.figure(figsize = (8,6))
plt.plot(x, y, 'b.')
plt.plot(x, alpha[0]*x + alpha[1], 'r')
plt.xlabel('Age in days')
plt.ylabel('Amount of money raised in Billion $')
plt.show()

png

From the graph it can be seen that there is a real connection between the age of the company in days and the amount of money that the company raised. It can be understood from this graph that the longer a company exists, the more likely it is to raise money. And as we have seen before, companies that raise money are more likely to succeed.

We now examine relationships and behaviors between the quantitative variables.

corr = df[['EMPLOYEES', 'NEWS_COUNT', 'FOUNDER_COUNT','INVESTORS_COUNT']].corr()
mask = np.zeros_like(corr)
with sns.axes_style("darkgrid"):
    f, ax = plt.subplots(figsize=(15, 15))
    ax = sns.heatmap(corr, mask=mask, vmax=1, square=True,annot=True)
    
# df = pd.DataFrame(df)
# sns.heatmap(df.corr(), annot=True)

png

We were able to find out that there is a strong connection between the number of employees in the company and the number of news articles about the company.

ct2 = pd.crosstab(df[(df['RAISED'] == 0)]['TARGET_INDUSTORY'], df[(df['RAISED'] == 0)]['SECTOR'])
ct2
SECTOR Aerospace & Aviation AgriFood-tech & Water Content & Media Energy-tech Enterprise, IT & Data Infrastructure FinTech Industrial Technologies Life Sciences & HealthTech Retail & Marketing Security Technologies Smart Mobility
TARGET_INDUSTORY
Agriculture & Food 11 343 1 10 11 3 34 8 1 4 4
Commerce & Retail 1 8 53 10 86 20 44 61 189 19 10
Communication Services 3 0 15 3 24 2 61 2 5 13 5
Consumers 4 32 711 23 169 163 21 293 222 85 87
Defense, Safety & Security 34 0 3 8 16 1 55 11 0 112 8
Education 0 1 33 0 7 1 3 13 0 7 0
Energy, Utilities & Waste Management 2 40 1 65 5 0 40 2 0 17 1
Enterprise & Professional Services 0 2 108 8 552 41 17 10 235 169 5
Financial Services 0 0 1 0 29 80 1 4 5 14 3
Food Retail & Consumption 0 17 5 1 7 0 3 2 15 0 0
Government & City 0 7 2 5 3 1 5 2 1 8 10
Healthcare & Life Sciences 0 12 4 4 12 2 40 608 0 1 0
Industrial Manufacturing 9 8 0 14 12 0 100 1 2 13 23
Media & Entertainment 0 0 126 0 4 1 1 0 24 0 1
Real Estate & Construction 0 4 1 11 9 9 31 0 6 4 0
Transportation & Logistics 2 0 1 0 5 1 6 0 5 3 15
Travel & Tourism 0 0 6 0 12 1 1 1 19 3 0

In the following graph we see the distribution of companies by target industry

fig, ax = plt.subplots(figsize=(18, 4))
fg = sns.histplot(df['TARGET_INDUSTORY'], ax=ax)
fg.set_title("Target Industory Histogram")
fg.set_xlabel("Target Industory")
plt.xticks(rotation=90, ha='right', rotation_mode='anchor')
plt.show()

png

We can see that the most common target industries are:

   1. Consumers
   2. Enterprise & Professional Services
   3. Life Sciences & HealthTech

We will now examine the data with a larger number of columns, to check whether the companies divide into several subgroups. We will do this using PCA.

EDA - Principal Component Analysis (PCA)

Step 1

We create a dataframe containing some numerical variables of our data set:

dataset = df.loc[:,[ 'NEWS_COUNT', 'FOUNDER_COUNT', 'TOTAL_ROUNDS', 'INVESTORS_COUNT','EMPLOYEES','PRODUCT_STAGE']]
dataset.shape
(8821, 6)

Step 2

We now need to create a PCA object, and then call the function that performs PCA on the dataset. The parameter n_components determines the number of dimensions we would like to have in the end:

pca2 = PCA(n_components=2) #creating a PCA object, while determining the desired number of dimensions
pcComponents = pca2.fit_transform(dataset) #performing PCA using fit_transform on our dataset

PCA creates new axes, hence, new variables - pcComponents is the new numerical data, with two dimensions:

pcComponents.shape
(8821, 2)
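
It is worth checking how much of the original variance the two components retain; sklearn's PCA exposes this directly:

# Fraction of the dataset's variance captured by each component.
print(pca2.explained_variance_ratio_)
print("total:", pca2.explained_variance_ratio_.sum())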

Step 3

To make it easy to display our results, we will create a new dataframe with the new features:

principalDf = pd.DataFrame(data = pcComponents, columns = ['principal component 1', 'principal component 2'])
principalDf
principal component 1 principal component 2
0 -1.057272 -0.672293
1 -0.516033 -1.171117
2 -0.620490 -1.167730
3 9.815548 -5.517506
4 4.519727 -3.071575
... ... ...
8816 -0.311000 -0.760671
8817 -2.873409 -0.201562
8818 -3.050129 -0.251598
8819 1.976308 4.964279
8820 -2.409851 -0.370227

8821 rows × 2 columns

Step 4

We also add the IS_SUCCESSFUL feature, so we will be able to display the data separating successful and unsuccessful companies:

finalDf = pd.concat([principalDf, df[['IS_SUCCESSFUL']]], axis = 1)

Step 5

We are ready to see our results. We will use a scatterplot:

fig = plt.figure()
ax = plt.axes()
colormap = np.array(['r', 'b'])
ax.scatter(finalDf['principal component 1'], finalDf['principal component 2'], c=colormap[finalDf.IS_SUCCESSFUL])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

png

As you can see, this figure shows no clear separation between successful and unsuccessful companies.

Chi-Square Test of Independence.

We will try to check if there is a relationship between categorical variables.

- EMPLOYEES column
- PRODUCT_STAGE column
ct1 = pd.crosstab(df.PRODUCT_STAGE, df.EMPLOYEES)
ct1
EMPLOYEES 0 1 2 3 4
PRODUCT_STAGE
0 196 8 1 0 0
1 1084 119 2 0 0
2 177 65 3 0 0
3 286 22 1 1 0
4 700 108 3 0 0
5 3234 2037 601 118 55
chi2_contingency(ct1)
(1071.8373864739401,
 1.8320493744078054e-214,
 20,
 array([[1.31933454e+02, 5.48231493e+01, 1.41996372e+01, 2.76555946e+00,
         1.27819975e+00],
        [7.75511280e+02, 3.22253146e+02, 8.34661603e+01, 1.62560934e+01,
         7.51332049e+00],
        [1.57676567e+02, 6.55203492e+01, 1.69702982e+01, 3.30518082e+00,
         1.52760458e+00],
        [1.99509126e+02, 8.29032989e+01, 2.14726222e+01, 4.18206553e+00,
         1.93288743e+00],
        [5.21941617e+02, 2.16885727e+02, 5.61751502e+01, 1.09408230e+01,
         5.05668292e+00],
        [3.89042796e+03, 1.61661433e+03, 4.18716132e+02, 8.15502777e+01,
         3.76913048e+01]]))
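The returned tuple is easier to read when unpacked; the tiny p-value (~1.8e-214) means we reject the hypothesis that EMPLOYEES and PRODUCT_STAGE are independent:

# Unpack the chi-square test results for readability.
chi2, p, dof, expected = chi2_contingency(ct1)
print(f"chi2 = {chi2:.1f}, p-value = {p:.3g}, dof = {dof}")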
df.to_csv('Data/companies_df/eda_df.csv', index=False)

5. Machine Learning 🤖🤖🤖

df = pd.read_csv('Data/companies_df/eda_df.csv')

To start with the machine learning training and prediction, we first need to finish encoding the remaining categorical columns.

'SECTOR' and 'TARGET_INDUSTORY' are not ordinal categoricals, so we can give them codes in any order (this time mapping each name to its code).
sector_replace_map = dict( enumerate(df['SECTOR'].astype('category').cat.categories ))
sector_replace_map = dict([(value, key) for key, value in sector_replace_map.items()])
df['SECTOR'].replace(sector_replace_map, inplace=True)

target_replace_map = dict( enumerate(df['TARGET_INDUSTORY'].astype('category').cat.categories ))
target_replace_map = dict([(value, key) for key, value in target_replace_map.items()])
df['TARGET_INDUSTORY'].replace(target_replace_map, inplace=True)

We need to predict the ‘IS_SUCCESSFUL’ column. Let us separate it and assign it to a target variable ‘y’. The rest of the data frame will be the set of input variables X.

y = df["IS_SUCCESSFUL"].values
x = df.drop(["IS_SUCCESSFUL"],axis=1)

Now let’s separate the training and the testing data.

#Divide into training and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0 ,test_size = 0.3) # 70% training and 30% test

Logistic Regression

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
acc = clf.score(X_test,y_test)
print(f"Accuracy of our model using logistic regression: {acc}")
Accuracy of our model using logistic regression: 0.8734416320362675

Let’s display our predicted results with a confusion matrix

y_predict = clf.predict(X_test)
cf_matrix = metrics.confusion_matrix(y_test, y_predict)

ax= plt.subplot()
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax, cmap='Blues');  # annot=True to annotate cells, fmt='g' to disable scientific notation

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); ax.set_title('Confusion Matrix'); ax.xaxis.set_ticklabels(['False','True']);ax.yaxis.set_ticklabels(['False','True']);

png

print("accuracy is:",metrics.accuracy_score(y_test, y_predict))
print("precision is:",metrics.precision_score(y_test, y_predict))
print("recall is:",metrics.recall_score(y_test, y_predict))
print("f1 is:",metrics.f1_score(y_test, y_predict))
accuracy is: 0.8734416320362675
precision is: 0.5
recall is: 0.03582089552238806
f1 is: 0.06685236768802229

As we can see the results are not so great, so let's try to make some changes:

First, we found that the 'TARGET_INDUSTORY' and 'FOUNDED_YEAR' features degrade model performance, so we will drop them.

Second, we want to scale our 'RAISED' feature because it has very high values.

df.drop(columns = ['TARGET_INDUSTORY'], axis=1, inplace=True)
df.drop(columns = ['FOUNDED_YEAR'], axis=1, inplace=True)

df[['RAISED']] = minmax_scale(df[['RAISED']])


y = df["IS_SUCCESSFUL"].values
x = df.drop(["IS_SUCCESSFUL"],axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0 ,test_size = 0.3) # 70% training and 30% test

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

ax= plt.subplot()
cf_matrix = metrics.confusion_matrix(y_test, y_predict)
sns.heatmap(cf_matrix, annot=True, fmt='g', ax=ax, cmap='Blues');  # annot=True to annotate cells, fmt='g' to disable scientific notation
# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); ax.set_title('Confusion Matrix'); ax.xaxis.set_ticklabels(['False','True']);ax.yaxis.set_ticklabels(['False','True']);

png

Now let’s look at the results again.

print("accuracy is:",metrics.accuracy_score(y_test, y_predict))
print("precision is:",metrics.precision_score(y_test, y_predict))
print("recall is:",metrics.recall_score(y_test, y_predict))
print("f1 is:",metrics.f1_score(y_test, y_predict))
accuracy is: 0.8866641480921799
precision is: 0.6446280991735537
recall is: 0.23283582089552238
f1 is: 0.3421052631578947

As we can see, precision, recall, and F1 all improved substantially!
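
Recall is still modest, which is typical with an imbalanced target (~12% successful). One option we did not pursue is sklearn's built-in class weighting; a sketch:

# Sketch: reweight classes inversely to their frequency, trading some
# accuracy for recall on the minority ('successful') class.
clf_balanced = LogisticRegression(solver='lbfgs', max_iter=1000,
                                  class_weight='balanced')
clf_balanced.fit(X_train, y_train)
print("balanced recall:", metrics.recall_score(y_test, clf_balanced.predict(X_test)))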

KNN - K-Nearest Neighbors

# set up the model, k-NN classification with k = 3
k = 3
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

cf_matrix = metrics.confusion_matrix(y_true = y_test, y_pred = y_predict)

print('Accuracy = ', metrics.accuracy_score(y_true = y_test, y_pred = y_predict))

Accuracy =  0.8587079712882508

Let us take a few possible values of k and fit the model on the training data for all those values. We will also compute the training score and testing score for all those values.

train_score = []
test_score = []
k_vals = []

for k in range(1, 75):
    k_vals.append(k)
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    
    tr_score = knn.score(X_train, y_train)
    train_score.append(tr_score)
    
    te_score = knn.score(X_test, y_test)
    test_score.append(te_score)

print(f"Max train_score is: {max(train_score)}\nMax test_score is: {max(test_score)}")
Max train_score is: 1.0
Max test_score is: 0.8745749905553457
plt.figure(figsize=(10,5))
plt.xlabel('Different Values of K')
plt.ylabel('Model score')
plt.plot(k_vals, train_score, color = 'r', label = "training score")
plt.plot(k_vals, test_score, color = 'b', label = 'test score')
plt.legend(bbox_to_anchor=(1, 1),
           bbox_transform=plt.gcf().transFigure)
plt.show()

png

We found the best K for our model!
K = 10
Accuracy = 0.8745749905553457
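
The best k can also be read off programmatically from the scores collected above:

# Pick the k with the highest test score from the sweep.
best_idx = int(np.argmax(test_score))
print("best k:", k_vals[best_idx], "test score:", test_score[best_idx])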

knn = KNeighborsClassifier(n_neighbors = 10)

#Fit the model
knn.fit(X_train,y_train)

#get the score
knn.score(X_test,y_test)

0.8745749905553457

We can make the following conclusions from the above plot: with small k the model badly overfits (training score close to 1.0), and the test score peaks around k = 10.

Decision Trees

def splitData(df, features, labels, specifed_random_state=0):
    """Split a subset of the dataset, given by the features, into train and test sets."""
    df_predictors = df[features].values
    df_labels = df[labels].values

    # Split into training and test sets
    XTrain, XTest, yTrain, yTest = train_test_split(df_predictors, df_labels, random_state=specifed_random_state)
    return XTrain, XTest, yTrain, yTest
def renderTree(my_tree, features):
    # hacky solution of writing to files and reading again
    # necessary due to library bugs
    filename = "temp.dot"
    with open(filename, 'w') as f:
        f = tree.export_graphviz(my_tree, 
                                 out_file=f, 
                                 feature_names=features, 
                                 class_names=["Perished", "Survived"],  
                                 filled=True, 
                                 rounded=True,
                                 special_characters=True)
  
    dot_data = ""
    with open(filename, 'r') as f:
        dot_data = f.read()

    graph = pydotplus.graph_from_dot_data(dot_data)
    image_name = "temp.png"
    graph.write_png(image_name)  
    display(Image(filename=image_name))

Here you can see the decision tree model, with the training and test accuracy, based on the ‘NEWS_COUNT’ feature only.

decisionTree = tree.DecisionTreeClassifier()
features = ['NEWS_COUNT']

XTrain, XTest, yTrain, yTest = splitData(df, features, ["IS_SUCCESSFUL"])
# fit the tree with the training data
decisionTree = decisionTree.fit(XTrain, yTrain)

# predict with the training data
y_pred_train = decisionTree.predict(XTrain)
# measure accuracy
print('Accuracy on training data = ', 
      metrics.accuracy_score(y_true = yTrain, y_pred = y_pred_train))

# predict with the test data
y_pred = decisionTree.predict(XTest)
# measure accuracy
print('Accuracy on test data = ', 
      metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

renderTree(decisionTree, features)
Accuracy on training data =  0.8808767951625095
Accuracy on test data =  0.8739800543970988

png

decisionTree = tree.DecisionTreeClassifier()
all_features = df.columns.tolist()

all_features.remove('IS_SUCCESSFUL')

# fit the tree with the training data
decisionTree = decisionTree.fit(X_train,y_train)

# predict with the training data
y_predict_train = decisionTree.predict(X_train)
# measure accuracy
print('Accuracy on training data = ', 
      metrics.accuracy_score(y_true = y_train, y_pred = y_predict_train))

# predict with the test data
y_predict = decisionTree.predict(X_test)
# measure accuracy
print('Accuracy on test data = ', 
      metrics.accuracy_score(y_true = y_test, y_pred = y_predict))

renderTree(decisionTree,all_features)
Accuracy on training data =  1.0
Accuracy on test data =  0.8617302606724594
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.899402 to fit

png

OK, clearly, we’re overfitting the data - 100% accuracy on the training data and only ~86% on the test data. Yet, we’ve created a complicated tree.
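
Rather than hand-picking the pruning parameters used below, the GridSearchCV we imported could search for them; a sketch (the grid values are illustrative):

# Sketch: cross-validated search over pruning parameters.
param_grid = {'max_depth': [3, 5, 7, 10],
              'min_samples_split': [2, 10, 20, 50]}
search = GridSearchCV(tree.DecisionTreeClassifier(random_state=0), param_grid, cv=5)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)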

decisionTree = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=20)
XTrain, XTest, yTrain, yTest = splitData(df, all_features, ["IS_SUCCESSFUL"])
decisionTree = decisionTree.fit(XTrain, yTrain)

y_pred_train = decisionTree.predict(XTrain)
print('Accuracy on training data= ', metrics.accuracy_score(y_true = yTrain, y_pred = y_pred_train))

y_pred = decisionTree.predict(XTest)
print('Accuracy on test data= ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))

renderTree(decisionTree, all_features)

Accuracy on training data=  0.8946334089191232
Accuracy on test data=  0.8907524932003626

png

A slight improvement: about 89% accuracy on both training and test data, without overfitting.

We closed the train-test gap, but test accuracy improved by only a few percentage points.

Naive Bayes

# Split into training and test sets
y = df["IS_SUCCESSFUL"].values
x = df.drop(["IS_SUCCESSFUL"],axis=1)

XTrain, XTest, yTrain, yTest = train_test_split(x, y, random_state=0, test_size=0.25)

# Instantiate the classifier
gnb = GaussianNB()

gnb.fit(XTrain,yTrain)

y_pred = gnb.predict(XTest)
y_pred_train = gnb.predict(XTrain)

# Print results
print('Accuracy on Train data= ', metrics.accuracy_score(y_true = yTrain, y_pred = y_pred_train))
print('Accuracy on test data= ', metrics.accuracy_score(y_true = yTest, y_pred = y_pred))
Accuracy on Train data=  0.816780045351474
Accuracy on test data=  0.8250226654578422
gnb.class_prior_

array([0.87951625, 0.12048375])

We can see that only about 12% of the companies are labeled successful.
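
This class imbalance puts all the accuracy numbers in context: always predicting 'unsuccessful' already scores about 88%:

# Majority-class baseline accuracy (~0.88, given ~12% successful).
print("baseline accuracy:", 1 - y.mean())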

In conclusion, we have seen that the algorithms that have brought us the best results are Decision Trees and Logistic Regression with about 89% accuracy.

Credit: part of the code was adapted from the Campus IL Data Science course.

df.to_csv('Data/companies_df/final_df.csv', index=False)