The Algorithms logoThe Algorithms
About

Video Games Analysis

K
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('E:/videogame/Video_Games_Sales_as_at_22_Dec_2016.csv')
df.head(20)
Name Platform Year_of_Release Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
0 Wii Sports Wii 2006.0 Sports Nintendo 41.36 28.96 3.77 8.45 82.53 76.0 51.0 8 322.0 Nintendo E
1 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24 NaN NaN NaN NaN NaN NaN
2 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.68 12.76 3.79 3.29 35.52 82.0 73.0 8.3 709.0 Nintendo E
3 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.61 10.93 3.28 2.95 32.77 80.0 73.0 8 192.0 Nintendo E
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37 NaN NaN NaN NaN NaN NaN
5 Tetris GB 1989.0 Puzzle Nintendo 23.20 2.26 4.22 0.58 30.26 NaN NaN NaN NaN NaN NaN
6 New Super Mario Bros. DS 2006.0 Platform Nintendo 11.28 9.14 6.50 2.88 29.80 89.0 65.0 8.5 431.0 Nintendo E
7 Wii Play Wii 2006.0 Misc Nintendo 13.96 9.18 2.93 2.84 28.92 58.0 41.0 6.6 129.0 Nintendo E
8 New Super Mario Bros. Wii Wii 2009.0 Platform Nintendo 14.44 6.94 4.70 2.24 28.32 87.0 80.0 8.4 594.0 Nintendo E
9 Duck Hunt NES 1984.0 Shooter Nintendo 26.93 0.63 0.28 0.47 28.31 NaN NaN NaN NaN NaN NaN
10 Nintendogs DS 2005.0 Simulation Nintendo 9.05 10.95 1.93 2.74 24.67 NaN NaN NaN NaN NaN NaN
11 Mario Kart DS DS 2005.0 Racing Nintendo 9.71 7.47 4.13 1.90 23.21 91.0 64.0 8.6 464.0 Nintendo E
12 Pokemon Gold/Pokemon Silver GB 1999.0 Role-Playing Nintendo 9.00 6.18 7.20 0.71 23.10 NaN NaN NaN NaN NaN NaN
13 Wii Fit Wii 2007.0 Sports Nintendo 8.92 8.03 3.60 2.15 22.70 80.0 63.0 7.7 146.0 Nintendo E
14 Kinect Adventures! X360 2010.0 Misc Microsoft Game Studios 15.00 4.89 0.24 1.69 21.81 61.0 45.0 6.3 106.0 Good Science Studio E
15 Wii Fit Plus Wii 2009.0 Sports Nintendo 9.01 8.49 2.53 1.77 21.79 80.0 33.0 7.4 52.0 Nintendo E
16 Grand Theft Auto V PS3 2013.0 Action Take-Two Interactive 7.02 9.09 0.98 3.96 21.04 97.0 50.0 8.2 3994.0 Rockstar North M
17 Grand Theft Auto: San Andreas PS2 2004.0 Action Take-Two Interactive 9.43 0.40 0.41 10.57 20.81 95.0 80.0 9 1588.0 Rockstar North M
18 Super Mario World SNES 1990.0 Platform Nintendo 12.78 3.75 3.54 0.55 20.61 NaN NaN NaN NaN NaN NaN
19 Brain Age: Train Your Brain in Minutes a Day DS 2005.0 Misc Nintendo 4.74 9.20 4.16 2.04 20.15 77.0 58.0 7.9 50.0 Nintendo E
df.shape #16719 rows, 16 columns. Get to know your data size for efficient analysis
(16719, 16)
df.columns  #get all the columns
Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating'],
      dtype='object')
#rename the columns to avoid confusion.
df.rename(columns={'EU_Sales' : 'Europe_Sales', 'NA_Sales' : 'N.America_Sales', 'JP_Sales' : 'Japan_Sales'},  inplace=True)
df.head(10) # take a look at the data.I have chosen first 7 rows
Name Platform Year_of_Release Genre Publisher N.America_Sales Europe_Sales Japan_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
0 Wii Sports Wii 2006.0 Sports Nintendo 41.36 28.96 3.77 8.45 82.53 76.0 51.0 8 322.0 Nintendo E
1 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24 NaN NaN NaN NaN NaN NaN
2 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.68 12.76 3.79 3.29 35.52 82.0 73.0 8.3 709.0 Nintendo E
3 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.61 10.93 3.28 2.95 32.77 80.0 73.0 8 192.0 Nintendo E
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37 NaN NaN NaN NaN NaN NaN
5 Tetris GB 1989.0 Puzzle Nintendo 23.20 2.26 4.22 0.58 30.26 NaN NaN NaN NaN NaN NaN
6 New Super Mario Bros. DS 2006.0 Platform Nintendo 11.28 9.14 6.50 2.88 29.80 89.0 65.0 8.5 431.0 Nintendo E
7 Wii Play Wii 2006.0 Misc Nintendo 13.96 9.18 2.93 2.84 28.92 58.0 41.0 6.6 129.0 Nintendo E
8 New Super Mario Bros. Wii Wii 2009.0 Platform Nintendo 14.44 6.94 4.70 2.24 28.32 87.0 80.0 8.4 594.0 Nintendo E
9 Duck Hunt NES 1984.0 Shooter Nintendo 26.93 0.63 0.28 0.47 28.31 NaN NaN NaN NaN NaN NaN
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
Name               16717 non-null object
Platform           16719 non-null object
Year_of_Release    16450 non-null float64
Genre              16717 non-null object
Publisher          16665 non-null object
N.America_Sales    16719 non-null float64
Europe_Sales       16719 non-null float64
Japan_Sales        16719 non-null float64
Other_Sales        16719 non-null float64
Global_Sales       16719 non-null float64
Critic_Score       8137 non-null float64
Critic_Count       8137 non-null float64
User_Score         10015 non-null object
User_Count         7590 non-null float64
Developer          10096 non-null object
Rating             9950 non-null object
dtypes: float64(9), object(7)
memory usage: 2.0+ MB
df.describe() #very important step in data analysis. Gives you a clear idea of sales and numerical figures
Year_of_Release N.America_Sales Europe_Sales Japan_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Count
count 16450.000000 16719.000000 16719.000000 16719.000000 16719.000000 16719.000000 8137.000000 8137.000000 7590.000000
mean 2006.487356 0.263330 0.145025 0.077602 0.047332 0.533543 68.967679 26.360821 162.229908
std 5.878995 0.813514 0.503283 0.308818 0.186710 1.547935 13.938165 18.980495 561.282326
min 1980.000000 0.000000 0.000000 0.000000 0.000000 0.010000 13.000000 3.000000 4.000000
25% 2003.000000 0.000000 0.000000 0.000000 0.000000 0.060000 60.000000 12.000000 10.000000
50% 2007.000000 0.080000 0.020000 0.000000 0.010000 0.170000 71.000000 21.000000 24.000000
75% 2010.000000 0.240000 0.110000 0.040000 0.030000 0.470000 79.000000 36.000000 81.000000
max 2020.000000 41.360000 28.960000 10.220000 10.570000 82.530000 98.000000 113.000000 10665.000000
#filter out the game with most Global sales
filterr = (df['Global_Sales']==df['Global_Sales'].max())
df['Name'][filterr]  #Wii Sports rules
0    Wii Sports
Name: Name, dtype: object
df['Publisher'].value_counts()
Electronic Arts                 1356
Activision                       985
Namco Bandai Games               939
Ubisoft                          933
Konami Digital Entertainment     834
                                ... 
Games Workshop                     1
Simon & Schuster Interactive       1
Havas Interactive                  1
Fortyfive                          1
PM Studios                         1
Name: Publisher, Length: 581, dtype: int64
Publisher_get = df.groupby(['Publisher'])
Publisher_get.get_group('Electronic Arts')
#For the sports lovers. the data below gives you look at all the games by EA_Sports
Name Platform Year_of_Release Genre Publisher N.America_Sales Europe_Sales Japan_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
77 FIFA 16 PS4 2015.0 Sports Electronic Arts 1.12 6.12 0.06 1.28 8.57 82.0 42.0 4.3 896.0 EA Sports E
81 FIFA Soccer 13 PS3 2012.0 Action Electronic Arts 1.06 5.01 0.13 1.97 8.16 88.0 37.0 6.6 348.0 Electronic Arts E
85 The Sims 3 PC 2009.0 Simulation Electronic Arts 0.99 6.42 0.00 0.60 8.01 86.0 75.0 7.6 886.0 The Sims Studio T
87 Star Wars Battlefront (2015) PS4 2015.0 Shooter Electronic Arts 2.99 3.49 0.22 1.28 7.98 NaN NaN NaN NaN NaN NaN
94 FIFA 17 PS4 2016.0 Sports Electronic Arts 0.66 5.75 0.08 1.11 7.59 85.0 41.0 5 398.0 EA Sports, EA Vancouver E
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16380 Tiger Woods PGA Tour 2005 PC 2004.0 Sports Electronic Arts 0.00 0.01 0.00 0.00 0.01 91.0 16.0 4.5 39.0 Headgate E
16489 Poker for Dummies PC 2008.0 Misc Electronic Arts 0.00 0.01 0.00 0.00 0.01 NaN NaN tbd NaN Electronic Arts T
16491 Command & Conquer Renegade PC 2002.0 Shooter Electronic Arts 0.00 0.01 0.00 0.00 0.01 NaN NaN NaN NaN NaN NaN
16510 The Godfather (JP sales) X360 2006.0 Action Electronic Arts 0.00 0.00 0.01 0.00 0.01 NaN NaN NaN NaN NaN NaN
16670 Psychic Detective PS 1995.0 Adventure Electronic Arts 0.01 0.00 0.00 0.00 0.01 NaN NaN NaN NaN NaN NaN

1356 rows × 16 columns

filt = (df['Japan_Sales']==df['Japan_Sales'].max())
df['Publisher'][filt]  #Nintendo company rules iN japan 
#In other regions Wii game has the highest sales
4    Nintendo
Name: Publisher, dtype: object
print(df['Japan_Sales'].isna().sum())
print(df['Publisher'].isna().sum()) #Publisher has a lot of empty values lets clean it up
0
54
type(np.nan) #Nan values create problems as they are float and rest of values are integers
float
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(25,25))
plt.plot(df['Name'].head(10), df['N.America_Sales'].head(10), color='red', label='N.America_Sales')
plt.plot(df['Name'].head(10), df['Japan_Sales'].head(10), color ='pink', label='Japan_Sales')
plt.plot(df['Name'].head(10), df['Europe_Sales'].head(10), color='yellow', label='Europe_Sales')
plt.tight_layout()
plt.legend()
plt.xlabel('Famous Games')
plt.ylabel('Sales')
plt.title('Popularity of Famous Games')
Text(0.5, 1, 'Popularity of Famous Games')
df.Publisher = df.Publisher.fillna('')
df.Publisher.isna().sum() #Cleared Nan values with empty string
0
Nin = (df['Publisher']=='Nintendo')
EA = (df['Publisher']=='Electronic Arts')
#filtering out EA sports and Nintendo to compare which compay dominates
#Nintendo sales across various regions
print(df['Japan_Sales'][Nin].sum())
print(df['Europe_Sales'][Nin].sum())
print(df['N.America_Sales'][Nin].sum())
print(df['Global_Sales'][Nin].sum())
458.15
419.01
816.9700000000001
1788.81
#EA sales across various regions
print(df['Japan_Sales'][EA].sum())
print(df['Europe_Sales'][EA].sum())
print(df['N.America_Sales'][EA].sum())
print(df['Global_Sales'][EA].sum())

14.350000000000001
373.90999999999997
599.5
1116.96
#Lets analyse the Genre of Video Games which are popular
df['Genre'].value_counts()
Action          3370
Sports          2348
Misc            1750
Role-Playing    1500
Shooter         1323
Adventure       1303
Racing          1249
Platform         888
Simulation       874
Fighting         849
Strategy         683
Puzzle           580
Name: Genre, dtype: int64
df['Genre'].fillna('Unknown', inplace = True) #Nan values replaced with "Unknown string"
labels = ['Action', 'Sports', 'Misc', 'Role-Playing', 'Shooter', 'Adventure', 'Racing', 'Platform', 'Simulation', 'Fighting', 'Strategy', 'Puzzle', 'Unknown']            
#Most popular games produced are of Action Genre followed by Sports
plt.style.use('fivethirtyeight')
fig = plt.figure(figsize=(11,11))
plt.pie(list(df['Genre'].value_counts()), autopct='%1.1f%%', labels=labels, wedgeprops={'edgecolor':'black'})
plt.title('Most Popular Genres in Video Games')
plt.tight_layout()
plt.show()
t  = sns.pairplot(df)
t 
#analyse the paiplots
C:\Users\ACER\Anaconda3\lib\site-packages\numpy\lib\histograms.py:824: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
C:\Users\ACER\Anaconda3\lib\site-packages\numpy\lib\histograms.py:825: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)
<seaborn.axisgrid.PairGrid at 0x270b356eac8>
df['Critic_Score']
0        76.0
1         NaN
2        82.0
3        80.0
4         NaN
         ... 
16714     NaN
16715     NaN
16716     NaN
16717     NaN
16718     NaN
Name: Critic_Score, Length: 16719, dtype: float64
df['Critic_Score'].isna().sum() #lots of values in Critic score
8582
df['Critic_Score'].median()
71.0
#Replace Nan values in Critic_score with median Values
df['Critic_Score'].fillna(71, inplace=True)
df['Critic_Score'].max()
98.0
filtera = (df['Critic_Score']==df['Critic_Score'].max())
df['Name'][filtera] #best games according to critic scores
51           Grand Theft Auto IV
57           Grand Theft Auto IV
227     Tony Hawk's Pro Skater 2
5350                 SoulCalibur
Name: Name, dtype: object
"""So GTA, Tony Hawk's Pro Skater 2 and SoulCalibur dominate when it comes to Critics ratings.So these are the games you should
loook out for next time you go game shopping"""
"So GTA, Tony Hawk's Pro Skater 2 and SoulCalibur dominate when it comes to Critics ratings.So these are the games you should\nloook out for next time you go game shopping"
df['Rating'].value_counts()
E       3991
T       2961
M       1563
E10+    1420
EC         8
RP         3
K-A        3
AO         1
Name: Rating, dtype: int64
df['Rating'].isna().sum() #Nan values which create the problems
6769
games_with_no_rating  = df['Rating'].isna()
df['Name'][games_with_no_rating] 
#the games with no ratings are mostly normal games which everyone can play and not adult games
1                    Super Mario Bros.
4             Pokemon Red/Pokemon Blue
5                               Tetris
9                            Duck Hunt
10                          Nintendogs
                     ...              
16714    Samurai Warriors: Sanada Maru
16715                 LMA Manager 2007
16716          Haitaka no Psychedelica
16717                 Spirits & Spells
16718              Winning Post 8 2016
Name: Name, Length: 6769, dtype: object
#Replace Nan values with E Rating i.e Everyone can play it
df['Rating'].fillna('E', inplace=True)
df['Rating'].value_counts()
list(df['Rating'].value_counts())
[10760, 2961, 1563, 1420, 8, 3, 3, 1]
from collections import Counter
a = list(df['Rating'])
letter_counts = Counter(a)
d = pd.DataFrame.from_dict(letter_counts, orient='index')


d.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x270bb67ce08>
#In the above Barplot E rating dominates which signifies most games made in industries are for people of all ages.
df
Name Platform Year_of_Release Genre Publisher N.America_Sales Europe_Sales Japan_Sales Other_Sales Global_Sales Critic_Score Critic_Count User_Score User_Count Developer Rating
0 Wii Sports Wii 2006.0 Sports Nintendo 41.36 28.96 3.77 8.45 82.53 76.0 51.0 8 322.0 Nintendo E
1 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24 71.0 NaN NaN NaN NaN E
2 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.68 12.76 3.79 3.29 35.52 82.0 73.0 8.3 709.0 Nintendo E
3 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.61 10.93 3.28 2.95 32.77 80.0 73.0 8 192.0 Nintendo E
4 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37 71.0 NaN NaN NaN NaN E
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
16714 Samurai Warriors: Sanada Maru PS3 2016.0 Action Tecmo Koei 0.00 0.00 0.01 0.00 0.01 71.0 NaN NaN NaN NaN E
16715 LMA Manager 2007 X360 2006.0 Sports Codemasters 0.00 0.01 0.00 0.00 0.01 71.0 NaN NaN NaN NaN E
16716 Haitaka no Psychedelica PSV 2016.0 Adventure Idea Factory 0.00 0.00 0.01 0.00 0.01 71.0 NaN NaN NaN NaN E
16717 Spirits & Spells GBA 2003.0 Platform Wanadoo 0.01 0.00 0.00 0.00 0.01 71.0 NaN NaN NaN NaN E
16718 Winning Post 8 2016 PSV 2016.0 Simulation Tecmo Koei 0.00 0.00 0.01 0.00 0.01 71.0 NaN NaN NaN NaN E

16719 rows × 16 columns

df['Year_of_Release'] = pd.to_datetime(df['Year_of_Release'], format='%Y')
df['Year_of_Release'].min()
Timestamp('1980-01-01 00:00:00')
df['Year_of_Release'].max()
Timestamp('2020-01-01 00:00:00')