Nashville Housing Data Cleaning - MSSQL
Kevin Folkes
2022-11-04

SELECT
*
FROM
PortfolioProject.dbo.Nashville_housing;

- Standardize Date Format
ALTER TABLE PortfolioProject.dbo.Nashville_housing Add SaleDateConverted DATE;
UPDATE PortfolioProject.dbo.Nashville_housing
SET
SaleDateConverted = CONVERT(DATE, SaleDate);
SELECT
SaleDate,
SaleDateConverted
FROM
PortfolioProject.dbo.Nashville_housing;

Populate the missing address in the Property Address field
SELECT
PropertyAddress
FROM
PortfolioProject.dbo.Nashville_housing
WHERE
PropertyAddress IS NULL;

- Self joined table to populate missing data in the property address
Select A.ParcelID, A.PropertyAddress, B.ParcelID, B.PropertyAddress, ISNULL(A.PropertyAddress, B.PropertyAddress)
From PortfolioProject.dbo.Nashville_housing as A
JOIN PortfolioProject.dbo.Nashville_housing as B
ON A.ParcelID = B.ParcelID
AND A.[UniqueID ] <> B.[UniqueID ]
Where A.PropertyAddress is null;

- Now that it’s verified that the addresses can be correctly populated, the table will now be updated.
Update A
SET PropertyAddress = ISNULL(A.PropertyAddress, B.PropertyAddress)
From PortfolioProject.dbo.Nashville_housing as A
JOIN PortfolioProject.dbo.Nashville_housing as B
ON A.ParcelID = B.ParcelID
AND A.[UniqueID ] <> B.[UniqueID ]
Where A.PropertyAddress is null
SELECT
PropertyAddress
FROM
PortfolioProject.dbo.Nashville_housing
WHERE
PropertyAddress IS NULL;
- 0 Rows were returned as all the addresses are now populated.
Breaking out Address into Individual Columns (Address, City, State).
- Using the SUBSTRING and CHARINDEX method to slip the property address by delimiter.
SELECT
SUBSTRING(
PropertyAddress,
1,
CHARINDEX (',', PropertyAddress) -1
) AS Address,
SUBSTRING(
PropertyAddress,
CHARINDEX (',', PropertyAddress) + 1,
LEN (PropertyAddress)
) AS Address
FROM
PortfolioProject.dbo.Nashville_housing;

- Updating tables with split address columns
ALTER TABLE Nashville_housing Add PropertySplitAddress Nvarchar (255);
UPDATE Nashville_housing
SET
PropertySplitAddress = SUBSTRING(
PropertyAddress,
1,
CHARINDEX (',', PropertyAddress) -1
);
ALTER TABLE Nashville_housing Add PropertySplitAddress Nvarchar (255);
UPDATE Nashville_housing
SET
PropertySplitAddress = SUBSTRING(
PropertyAddress,
1,
CHARINDEX (',', PropertyAddress) -1
);
- Confirming that new columns were added and that the data is correct.
SELECT
PropertySplitAddress,
PropertySplitCity
FROM
PortfolioProject.dbo.Nashville_housing;

Using PARSENAME method, as PARSENAME only works with period/full stops ‘.’, the commas were replaced.
SELECT
OwnerAddress
FROM
PortfolioProject.dbo.Nashville_housing;

ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitAddress Nvarchar (255);
UPDATE PortfolioProject.dbo.Nashville_housing
SET
OnwerSplitAddress = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 3);
ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitCity Nvarchar (255);
UPDATE PortfolioProject.dbo.Nashville_housing
SET
OnwerSplitCity = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 2);
ALTER TABLE PortfolioProject.dbo.Nashville_housing Add OnwerSplitState Nvarchar (255);
UPDATE PortfolioProject.dbo.Nashville_housing
SET
OnwerSplitState = PARSENAME (REPLACE (OwnerAddress, ',', '.'), 1);
- Verifying that the new columns were added and contain the correct split address data.
SELECT
OnwerSplitAddress,
OnwerSplitCity,
OnwerSplitState
FROM
PortfolioProject.dbo.Nashville_housing;

Change “Y” and “N” to “Yes” and “No” in “Sold as Vacant” field
SELECT DISTINCT
(SoldAsVacant),
COUNT(SoldAsVacant)
FROM
PortfolioProject.dbo.Nashville_housing
GROUP BY
SoldAsVacant
ORDER BY
2;

UPDATE Nashville_housing
SET
SoldAsVacant = CASE
WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END;
SELECT DISTINCT
(SoldAsVacant),
COUNT(SoldAsVacant)
FROM
PortfolioProject.dbo.Nashville_housing
GROUP BY
SoldAsVacant
ORDER BY
2;

Remove Duplicates
WITH RowNumCTE As (
Select *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) as row_num
From PortfolioProject.dbo.Nashville_housing
--Order by ParcelID
)
SELECT COUNT(*) as number_of_duplicated_rows
FROM RowNumCTE
WHERE row_num > 1;
- 103 duplicated rows have been identified

- Usually, the imported data is not modified in such a way and instead all the data remaines with the manipulations being done on a view/temp table.
WITH RowNumCTE As (
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) as row_num
From PortfolioProject.dbo.Nashville_housing
--Order by ParcelID
)
DELETE
From RowNumCTE
Where row_num > 1;