rm -r dp-203 -f
git clone https://github.com/MicrosoftLearning/dp-203-azure-data-engineer dp-203
cd dp-203/Allfiles/labs/09
./setup.ps1
SELECT COUNT(1)
FROM dbo.StageProduct
COPY INTO dbo.StageProduct
(ProductID, ProductName, ProductCategory, Color, Size, ListPrice, Discontinued)
FROM 'https://datalakexxxxxx.blob.core.windows.net/files/data/Product.csv'
WITH
(
FILE_TYPE = 'CSV',
MAXERRORS = 0,
IDENTITY_INSERT = 'OFF',
FIRSTROW = 2 --Skip header row
);
SELECT COUNT(1)
FROM dbo.StageProduct
COPY INTO dbo.StageCustomer
(GeographyKey, CustomerAlternateKey, Title, FirstName, MiddleName, LastName, NameStyle, BirthDate,
MaritalStatus, Suffix, Gender, EmailAddress, YearlyIncome, TotalChildren, NumberChildrenAtHome, EnglishEducation,
SpanishEducation, FrenchEducation, EnglishOccupation, SpanishOccupation, FrenchOccupation, HouseOwnerFlag,
NumberCarsOwned, AddressLine1, AddressLine2, Phone, DateFirstPurchase, CommuteDistance)
FROM 'https://datalakexxxxxx.dfs.core.windows.net/files/data/Customer.csv'
WITH
(
FILE_TYPE = 'CSV'
,MAXERRORS = 5
,FIRSTROW = 2 -- skip header row
,ERRORFILE = 'https://datalakexxxxxx.dfs.core.windows.net/files/'
);
SELECT *
FROM dbo.StageCustomer
On the files tab, view the root folder of your data lake and verify that a new folder named _rejectedrows has been created (if you don’t see this folder, in the More menu, select Refresh to refresh the view).
Open the _rejectedrows folder and the date and time specific subfolder it contains, and note that files with names similar to QID123_1_2.Error.Txt and QID123_1_2.Row.Txt have been created. You can right-click each of these files and select Preview to see details of the error and the row that was rejected.
The use of staging tables enables you to validate or transform data before moving or using it to append to or upsert into any existing dimension tables. The COPY statement provides a simple but high-performance technique that you can use to easily load data from files in a data lake into staging tables, and as you’ve seen, identify and redirect invalid rows.
CREATE TABLE dbo.DimProduct
WITH
(
DISTRIBUTION = HASH(ProductAltKey),
CLUSTERED COLUMNSTORE INDEX
)
AS
SELECT ROW_NUMBER() OVER(ORDER BY ProductID) AS ProductKey,
ProductID AS ProductAltKey,
ProductName,
ProductCategory,
Color,
Size,
ListPrice,
Discontinued
FROM dbo.StageProduct;
SELECT ProductKey,
ProductAltKey,
ProductName,
ProductCategory,
Color,
Size,
ListPrice,
Discontinued
FROM dbo.DimProduct;
INSERT INTO dbo.DimCustomer ([GeographyKey],[CustomerAlternateKey],[Title],[FirstName],[MiddleName],[LastName],[NameStyle],[BirthDate],[MaritalStatus],
[Suffix],[Gender],[EmailAddress],[YearlyIncome],[TotalChildren],[NumberChildrenAtHome],[EnglishEducation],[SpanishEducation],[FrenchEducation],
[EnglishOccupation],[SpanishOccupation],[FrenchOccupation],[HouseOwnerFlag],[NumberCarsOwned],[AddressLine1],[AddressLine2],[Phone],
[DateFirstPurchase],[CommuteDistance])
SELECT *
FROM dbo.StageCustomer AS stg
WHERE NOT EXISTS
(SELECT * FROM dbo.DimCustomer AS dim
WHERE dim.CustomerAlternateKey = stg.CustomerAlternateKey);
-- Type 1 updates (change name, email, or phone in place)
UPDATE dbo.DimCustomer
SET LastName = stg.LastName,
EmailAddress = stg.EmailAddress,
Phone = stg.Phone
FROM DimCustomer dim inner join StageCustomer stg
ON dim.CustomerAlternateKey = stg.CustomerAlternateKey
WHERE dim.LastName <> stg.LastName OR dim.EmailAddress <> stg.EmailAddress OR dim.Phone <> stg.Phone
-- Type 2 updates (address changes triggers new entry)
INSERT INTO dbo.DimCustomer
SELECT stg.GeographyKey,stg.CustomerAlternateKey,stg.Title,stg.FirstName,stg.MiddleName,stg.LastName,stg.NameStyle,stg.BirthDate,stg.MaritalStatus,
stg.Suffix,stg.Gender,stg.EmailAddress,stg.YearlyIncome,stg.TotalChildren,stg.NumberChildrenAtHome,stg.EnglishEducation,stg.SpanishEducation,stg.FrenchEducation,
stg.EnglishOccupation,stg.SpanishOccupation,stg.FrenchOccupation,stg.HouseOwnerFlag,stg.NumberCarsOwned,stg.AddressLine1,stg.AddressLine2,stg.Phone,
stg.DateFirstPurchase,stg.CommuteDistance
FROM dbo.StageCustomer AS stg
JOIN dbo.DimCustomer AS dim
ON stg.CustomerAlternateKey = dim.CustomerAlternateKey
AND stg.AddressLine1 <> dim.AddressLine1;
ALTER INDEX ALL ON dbo.DimProduct REBUILD;
CREATE STATISTICS customergeo_stats
ON dbo.DimCustomer (GeographyKey);