Nashville Housing Data Cleaning - MSSQL

Nashville-Housing-Data-Cleaning

Kevin Folkes
2022-11-04

midtown-nashville-real-estate.jpg



SELECT
  *
FROM
  PortfolioProject.dbo.Nashville_housing;
image.png


  • Standardize Date Format
ALTER TABLE PortfolioProject.dbo.Nashville_housing Add SaleDateConverted DATE;
UPDATE PortfolioProject.dbo.Nashville_housing
SET
  SaleDateConverted = CONVERT(DATE, SaleDate);
SELECT
  SaleDate,
  SaleDateConverted
FROM
  PortfolioProject.dbo.Nashville_housing;


image.png


Populate the missing address in the Property Address field


SELECT
  PropertyAddress
FROM
  PortfolioProject.dbo.Nashville_housing
WHERE
  PropertyAddress IS NULL;
image.png


  • Self joined table to populate missing data in the property address
Select A.ParcelID, A.PropertyAddress, B.ParcelID, B.PropertyAddress, ISNULL(A.PropertyAddress, B.PropertyAddress)
From PortfolioProject.dbo.Nashville_housing as A
JOIN PortfolioProject.dbo.Nashville_housing as B
    ON A.ParcelID = B.ParcelID
    AND A.[UniqueID ] <> B.[UniqueID ]
Where A.PropertyAddress is null;


image.png


  • Now that it’s verified that the addresses can be correctly populated, the table will now be updated.
Update A
SET PropertyAddress = ISNULL(A.PropertyAddress, B.PropertyAddress)
From PortfolioProject.dbo.Nashville_housing as A
JOIN PortfolioProject.dbo.Nashville_housing as B
    ON A.ParcelID = B.ParcelID
    AND A.[UniqueID ] <> B.[UniqueID ]
Where A.PropertyAddress is null


SELECT
  PropertyAddress
FROM
  PortfolioProject.dbo.Nashville_housing
WHERE
  PropertyAddress IS NULL;


  • 0 Rows were returned as all the addresses are now populated.

Breaking out Address into Individual Columns (Address, City, State).

  • Using the SUBSTRING and CHARINDEX method to slip the property address by delimiter.


SELECT
  SUBSTRING(
    PropertyAddress,
    1,
    CHARINDEX (',', PropertyAddress) -1
  ) AS Address,
  SUBSTRING(
    PropertyAddress,
    CHARINDEX (',', PropertyAddress) + 1,
    LEN (PropertyAddress)
  ) AS Address
FROM
  PortfolioProject.dbo.Nashville_housing;


image.png


  • Updating tables with split address columns


ALTER TABLE Nashville_housing Add PropertySplitAddress Nvarchar (255);
          
UPDATE Nashville_housing
SET
  PropertySplitAddress = SUBSTRING(
    PropertyAddress,
    1,
    CHARINDEX (',', PropertyAddress) -1
  );


ALTER TABLE Nashville_housing Add PropertySplitAddress Nvarchar (255);
          
UPDATE Nashville_housing
SET
  PropertySplitAddress = SUBSTRING(
    PropertyAddress,
    1,
    CHARINDEX (',', PropertyAddress) -1
  );


  • Confirming that new columns were added and that the data is correct.
SELECT 
  PropertySplitAddress,
  PropertySplitCity
FROM
  PortfolioProject.dbo.Nashville_housing;


image.png


Using PARSENAME method, as PARSENAME only works with period/full stops ‘.’, the commas were replaced.

SELECT
  OwnerAddress
FROM
  PortfolioProject.dbo.Nashville_housing;


image.png


ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitAddress Nvarchar (255);
          
UPDATE PortfolioProject.dbo.Nashville_housing
SET
  OnwerSplitAddress = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 3);


ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitCity Nvarchar (255);
          
UPDATE PortfolioProject.dbo.Nashville_housing
SET
  OnwerSplitCity = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 2);


ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitState Nvarchar (255);
          
UPDATE PortfolioProject.dbo.Nashville_housing
SET
  OnwerSplitState = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 1);


  • Verifying that the new columns were added and contain the correct split address data.
SELECT
  OnwerSplitAddress,
  OnwerSplitCity,
  OnwerSplitState
FROM
  PortfolioProject.dbo.Nashville_housing;


image.png

Change “Y” and “N” to “Yes” and “No” in “Sold as Vacant” field

SELECT DISTINCT
  (SoldAsVacant),
  COUNT(SoldAsVacant)
FROM
  PortfolioProject.dbo.Nashville_housing
GROUP BY
  SoldAsVacant
ORDER BY
  2;


image.png


UPDATE Nashville_housing
SET
  SoldAsVacant = CASE
    WHEN SoldAsVacant = 'Y' THEN 'Yes'
    WHEN SoldAsVacant = 'N' THEN 'No'
    ELSE SoldAsVacant
  END;


SELECT DISTINCT
  (SoldAsVacant),
  COUNT(SoldAsVacant)
FROM
  PortfolioProject.dbo.Nashville_housing
GROUP BY
  SoldAsVacant
ORDER BY
  2;


image.png


Remove Duplicates

WITH RowNumCTE As (
Select  *, 
    ROW_NUMBER() OVER(
    PARTITION BY ParcelID,
                PropertyAddress,
                SalePrice,
                SaleDate,
                LegalReference
                ORDER BY
                    UniqueID
                    ) as row_num
                
From PortfolioProject.dbo.Nashville_housing
--Order by ParcelID
)
SELECT COUNT(*) as number_of_duplicated_rows
FROM  RowNumCTE
WHERE row_num > 1;

  • 103 duplicated rows have been identified
image.png


  • Usually, the imported data is not modified in such a way and instead all the data remaines with the manipulations being done on a view/temp table.
WITH RowNumCTE As (
SELECT  *, 
    ROW_NUMBER() OVER(
    PARTITION BY ParcelID,
                PropertyAddress,
                SalePrice,
                SaleDate,
                LegalReference
                ORDER BY
                    UniqueID
                    ) as row_num
                
From PortfolioProject.dbo.Nashville_housing
--Order by ParcelID
)

DELETE 
From RowNumCTE
Where row_num > 1;