- Explore Dataset
- Tidy and Clean data so it can be used for further analysis.
Review Tables and Data Types
SELECT
*
FROM
us_project.us_household_income;
SELECT
*
FROM
us_project.us_household_income_statistics;
SHOW COLUMNS
FROM
us_project.us_household_income;
SHOW COLUMNS
FROM
us_project.us_household_income_statistics;
Identifying and Removing Duplicates
SELECT
id,
COUNT(id) AS id_count
FROM
us_project.us_household_income
GROUP BY
id
HAVING
id_count > 1;
--Removing identified duplicates from last step:
DELETE FROM us_project.us_household_income
WHERE
row_id IN (
SELECT
row_id
FROM
(
SELECT
row_id,
id,
ROW_NUMBER() OVER (
PARTITION BY
id
ORDER BY
id
) AS Row_Num
FROM
us_project.us_household_income
) AS Row_Table
WHERE
Row_Num > 1
);
- Verifying if duplicates were removed
SELECT
id,
COUNT(id) AS id_count
FROM
us_project.us_household_income
GROUP BY
id
HAVING
id_count > 1;
- 0 rows were returned, therefore duplicates were successfully
removed.
Standardizing Data in columns
SELECT DISTINCT
State_Name
FROM
us_project.us_household_income_staging;
UPDATE us_project.us_household_income
SET
State_Name = 'Alabama'
WHERE
State_name = 'alabama';
UPDATE us_project.us_household_income
SET
State_Name = 'Georgia'
WHERE
State_name = 'georia';
SELECT
*
FROM
us_project.us_household_income
WHERE
Place = '';
- There’s a missing entry for the Place field, we’ll fill in the gap
where the County and City is the same.
SELECT
*
FROM
us_project.us_household_income
WHERE
County = 'Autauga County'
AND City = 'Vinemont';
UPDATE us_project.us_household_income
SET
Place = 'Autaugaville'
WHERE
County = 'Autauga County'
AND City = 'Vinemont';
SELECT
*
FROM
us_project.us_household_income
WHERE
County = 'Autauga County'
AND City = 'Vinemont';
- Now the Place column shows ‘Autaugaville’ for row_id 32
SELECT
Type,
COUNT(Type)
FROM
us_project.us_household_income
GROUP BY
Type;
- 2 Entries for boroughs (‘Borough’ & ‘Boroughs’)
SELECT
Type,
COUNT(Type)
FROM
us_project.us_household_income
GROUP BY
Type;;
UPDATE us_project.us_household_income
SET
Type = 'Borough'
WHERE
Type = 'Boroughs';