Exploratory data analysis on video game sales.
The dataset is sourced from Kaggle and contains a list of video games with sales from 1980 to 2020. It has 11 columns and 16,598 records. I dropped records with null values to make the results of the analysis more accurate. In addition, I split the dataset into two related tables following the principle of data normalization: the "game_info" table contains the basic information about each game, and the "game_sales" table contains each game's sales records in various regions.
Since we will need to join both tables multiple times to perform the analysis, I create a temporary table for reusability. Using a temporary table makes the later queries much shorter and can also reduce runtime, because the join is performed only once.
-- Working table for the rest of the analysis: joins game_info to game_sales
-- on game_id and keeps the descriptive columns together with the regional
-- and global sales columns, so later queries do not need to repeat the join.
CREATE TEMPORARY TABLE game_sales_temp AS
SELECT
    info.game_id,
    name,
    platform,
    year,
    genre,
    publisher,
    na_sales,
    eu_sales,
    jp_sales,
    other_sales,
    global_sales
FROM game_info AS info
INNER JOIN game_sales AS sales
    ON sales.game_id = info.game_id;

-- Preview the joined result.
SELECT * FROM game_sales_temp;
Let's take a look at the completed dataset:
- game_id - Unique game id
- name - The game's name
- platform - Platform of the game's release (e.g. PC, PS4, etc.)
- year - Year of the game's release
- genre - Genre of the game
- publisher - Publisher of the game
- na_sales - Sales in North America (in millions)
- eu_sales - Sales in Europe (in millions)
- jp_sales - Sales in Japan (in millions)
- other_sales - Sales in the rest of the world (in millions)
- global_sales - Total worldwide sales.
-- Three platforms with the most game records (rows whose name is not NULL).
SELECT
    platform,
    COUNT(name) AS num_games
FROM game_sales_temp
GROUP BY platform
ORDER BY num_games DESC
LIMIT 3;
- DS, PS2 and PS3 are three dominant platforms globally
2. A specific game can be played on multiple platforms. Find the games which were played on more than 5 platforms.
-- Games released on more than five platforms, widest release first.
-- COUNT(DISTINCT platform) is used so that a title appearing in more than one
-- row for the same platform is counted once per platform, not once per row.
-- NOTE(review): figures quoted from earlier row-count runs may differ slightly.
SELECT
    name,
    COUNT(DISTINCT platform) AS num_platform
FROM game_sales_temp
GROUP BY name
HAVING COUNT(DISTINCT platform) > 5
-- name is a tie-breaker so the output order is deterministic.
ORDER BY num_platform DESC, name;
- There are 133 games that were played on more than 5 platforms
- 'Need for Speed: Most Wanted' is the game that was played on the most platforms
-- Best-selling genre on each platform, listed from the largest genre total
-- down. Genres that tie on the rounded total for a platform are all kept.
WITH genre_totals AS (
    -- Worldwide sales per (platform, genre), rounded to two decimals.
    SELECT
        platform,
        genre,
        ROUND(SUM(global_sales), 2) AS total_sales
    FROM game_sales_temp
    GROUP BY platform, genre
),
ranked_genres AS (
    -- Rank genres within each platform by their rounded total.
    SELECT
        platform,
        genre,
        total_sales,
        DENSE_RANK() OVER (
            PARTITION BY platform
            ORDER BY total_sales DESC
        ) AS rank_sales
    FROM genre_totals
)
SELECT
    platform,
    genre,
    total_sales
FROM ranked_genres
WHERE rank_sales = 1
ORDER BY total_sales DESC;
-- Top three games by worldwide sales within each genre.
WITH game_totals AS (
    -- Unrounded worldwide sales per (genre, game name).
    SELECT
        genre,
        name,
        SUM(global_sales) AS raw_sales
    FROM game_sales_temp
    GROUP BY genre, name
),
ranked_games AS (
    -- Ranking uses the unrounded total; only the displayed value is rounded.
    SELECT
        genre,
        name,
        ROUND(raw_sales, 2) AS total_sales,
        ROW_NUMBER() OVER (
            PARTITION BY genre
            ORDER BY raw_sales DESC
        ) AS sales_rank
    FROM game_totals
)
SELECT
    genre,
    name,
    total_sales,
    sales_rank
FROM ranked_games
WHERE sales_rank <= 3;
-- Worldwide sales per genre and each genre's percentage of all sales,
-- largest share first.
-- The grand total is taken with a window over the grouped rows instead of a
-- scalar subquery, so the statement depends only on game_sales_temp and reads
-- it once (MySQL cannot refer to a TEMPORARY table more than once in a query).
SELECT
    genre,
    ROUND(SUM(global_sales), 2) AS total_sales,
    ROUND(
        (SUM(global_sales) / (SUM(SUM(global_sales)) OVER ())) * 100,
        2
    ) AS `sale%`
FROM game_sales_temp
GROUP BY genre
ORDER BY `sale%` DESC;
-- Total sales per genre in North America, Europe and Japan.
SELECT
    genre,
    ROUND(SUM(na_sales), 2) AS North_America_sales,
    ROUND(SUM(eu_sales), 2) AS Europe_sales,
    ROUND(SUM(jp_sales), 2) AS Japan_sales
FROM game_sales_temp
GROUP BY genre;
-- Optional sort keys: place one before the semicolon to rank genres by region.
-- ORDER BY North_America_sales DESC  # most popular genre in North America
-- ORDER BY Europe_sales DESC         # most popular genre in Europe
-- ORDER BY Japan_sales DESC          # most popular genre in Japan
- Action is the most popular genre in both North America and Europe
- Role-playing is the most popular genre in Japan
-- Publishers whose total worldwide sales exceed the average publisher total,
-- highest total first.
WITH publisher_totals AS (
    -- Worldwide sales per publisher, rounded to two decimals.
    SELECT
        publisher,
        ROUND(SUM(global_sales), 2) AS total_sales
    FROM game_info AS i
    LEFT JOIN game_sales AS s
        ON i.game_id = s.game_id
    GROUP BY publisher
)
SELECT
    publisher,
    total_sales
FROM publisher_totals
WHERE total_sales > (SELECT AVG(total_sales) FROM publisher_totals)
-- The sort belongs on the outer query: an ORDER BY inside the CTE does not
-- guarantee the order of the final result.
ORDER BY total_sales DESC;
- A total of 39 publishers have total global sales greater than the average global sales of all publishers.
-- For every genre, the year (or years, on a tie) with the most game records,
-- listed chronologically.
WITH releases_per_year AS (
    -- Number of joined rows per (genre, year).
    SELECT
        genre,
        year,
        COUNT(*) AS num_games
    FROM game_info AS info
    LEFT JOIN game_sales AS sales
        ON info.game_id = sales.game_id
    GROUP BY genre, year
    ORDER BY genre, year
)
SELECT
    genre,
    year,
    num_games
FROM releases_per_year
-- Keep only the rows whose count equals that genre's maximum.
WHERE (genre, num_games) IN (
    SELECT
        genre,
        MAX(num_games) AS most_releases
    FROM releases_per_year
    GROUP BY genre
)
ORDER BY year;
- We can see that every genre had its most releases during the period 2003 - 2009.
9. Find the year, number of games and their names that sold more than 15 million worldwide. Sort the game name lexicographically.
-- Per release year: how many distinct games have a record with worldwide
-- sales above 15 (million), and their names in alphabetical order.
-- The aliases name games; the data set contains games, not movies.
-- Note: GROUP_CONCAT output is truncated at group_concat_max_len
-- (1024 by default); raise it if a year's list is cut off.
SELECT
    year,
    COUNT(DISTINCT name) AS num_games,
    GROUP_CONCAT(DISTINCT name ORDER BY name SEPARATOR ', ') AS `game(s)`
FROM game_sales_temp
WHERE global_sales > 15
GROUP BY year
ORDER BY year;
-- Worldwide sales totals per decade of release.
-- The aliases describe sales because the columns sum global_sales; they are
-- not game counts. The 2020s bucket covers rows whose year starts with 202,
-- which no other bucket would include.
SELECT
    ROUND(SUM(CASE WHEN year LIKE '198%' THEN global_sales ELSE 0 END), 2) AS sales_1980s,
    ROUND(SUM(CASE WHEN year LIKE '199%' THEN global_sales ELSE 0 END), 2) AS sales_1990s,
    ROUND(SUM(CASE WHEN year LIKE '200%' THEN global_sales ELSE 0 END), 2) AS sales_2000s,
    ROUND(SUM(CASE WHEN year LIKE '201%' THEN global_sales ELSE 0 END), 2) AS sales_2010s,
    ROUND(SUM(CASE WHEN year LIKE '202%' THEN global_sales ELSE 0 END), 2) AS sales_2020s
FROM game_sales_temp;
- The consecutive years from 2002 to 2011 are the top 10 years with the most sales, peaking in 2008.
-- Preview the joined working table (game_sales_temp is the table this script
-- creates; game_sales_completed is not defined in the visible script).
SELECT * FROM game_sales_temp;
-- Each region's total sales and its percentage of worldwide sales.
WITH region_totals AS (
    -- Sum every sales column once; the ratios below reuse these totals.
    SELECT
        SUM(na_sales)     AS na_total,
        SUM(eu_sales)     AS eu_total,
        SUM(jp_sales)     AS jp_total,
        SUM(other_sales)  AS other_total,
        SUM(global_sales) AS global_total
    FROM game_sales_temp
)
SELECT
    ROUND(na_total, 2)                          AS na_sales,
    ROUND((na_total / global_total) * 100, 2)    AS `na_sales%`,
    ROUND(eu_total, 2)                          AS eu_sales,
    ROUND((eu_total / global_total) * 100, 2)    AS `eu_sales_ratio%`,
    ROUND(jp_total, 2)                          AS jp_sales,
    ROUND((jp_total / global_total) * 100, 2)    AS `jp_sales%`,
    ROUND(other_total, 2)                       AS other_sales,
    ROUND((other_total / global_total) * 100, 2) AS `other_sales%`
FROM region_totals;
- North America accounts for about half of global sales