US Household Income Data Cleaning - MySQL

Us-Household-Income-Data-Cleaning
image.png


  • Explore Dataset
  • Tidy and Clean data so it can be used for further analysis.


Review Tables and Data Types

SELECT
  *
FROM
  us_project.us_household_income;
image.png


SELECT
  *
FROM
  us_project.us_household_income_statistics;
image.png


SHOW COLUMNS
FROM
  us_project.us_household_income;
image.png


SHOW COLUMNS
FROM
  us_project.us_household_income_statistics;
image.png


Identifying and Removing Duplicates

SELECT
  id,
  COUNT(id) AS id_count
FROM
  us_project.us_household_income
GROUP BY
  id
HAVING
  id_count > 1;
image.png


--Removing identified duplicates from last step: 
DELETE FROM us_project.us_household_income
WHERE
  row_id IN (
    SELECT
      row_id
    FROM
      (
        SELECT
          row_id,
          id,
          ROW_NUMBER() OVER (
            PARTITION BY
              id
            ORDER BY
              id
          ) AS Row_Num
        FROM
          us_project.us_household_income
      ) AS Row_Table
    WHERE
      Row_Num > 1
  );
  • Verifying if duplicates were removed
SELECT
  id,
  COUNT(id) AS id_count
FROM
  us_project.us_household_income
GROUP BY
  id
HAVING
  id_count > 1;
  • 0 rows were returned, therefore duplicates were successfully removed.

Standardizing Data in columns

SELECT DISTINCT
  State_Name
FROM
  us_project.us_household_income_staging;
image.png


UPDATE us_project.us_household_income
SET
  State_Name = 'Alabama'
WHERE
  State_name = 'alabama';
UPDATE us_project.us_household_income
SET
  State_Name = 'Georgia'
WHERE
  State_name = 'georia';
image.png


SELECT
  *
FROM
  us_project.us_household_income
WHERE
  Place = '';
image.png


  • There’s a missing entry for the Place field, we’ll fill in the gap where the County and City is the same.
SELECT
  *
FROM
  us_project.us_household_income
WHERE
  County = 'Autauga County'
  AND City = 'Vinemont';
image.png


UPDATE us_project.us_household_income
SET
  Place = 'Autaugaville'
WHERE
  County = 'Autauga County'
  AND City = 'Vinemont';
SELECT
  *
FROM
  us_project.us_household_income
WHERE
  County = 'Autauga County'
  AND City = 'Vinemont';
image.png


  • Now the Place column shows ‘Autaugaville’ for row_id 32
SELECT
  Type,
  COUNT(Type)
FROM
  us_project.us_household_income
GROUP BY
  Type;
image.png


  • 2 Entries for boroughs (‘Borough’ & ‘Boroughs’)
SELECT
  Type,
  COUNT(Type)
FROM
  us_project.us_household_income
GROUP BY
            Type;;
image.png


UPDATE us_project.us_household_income
SET
  Type = 'Borough'
WHERE
  Type = 'Boroughs';
image.png


  • Now only Borough exists