Load Data Warehouse

rm -r dp-203 -f

git clone https://github.com/MicrosoftLearning/dp-203-azure-data-engineer dp-203

cd dp-203/Allfiles/labs/09

./setup.ps1

SELECT COUNT(1) 
 FROM dbo.StageProduct

 COPY INTO dbo.StageProduct
     (ProductID, ProductName, ProductCategory, Color, Size, ListPrice, Discontinued)
 FROM 'https://datalakexxxxxx.blob.core.windows.net/files/data/Product.csv'
 WITH
 (
     FILE_TYPE = 'CSV',
     MAXERRORS = 0,
     IDENTITY_INSERT = 'OFF',
     FIRSTROW = 2 --Skip header row
 );


 SELECT COUNT(1) 
 FROM dbo.StageProduct

 COPY INTO dbo.StageCustomer
 (GeographyKey, CustomerAlternateKey, Title, FirstName, MiddleName, LastName, NameStyle, BirthDate, 
 MaritalStatus, Suffix, Gender, EmailAddress, YearlyIncome, TotalChildren, NumberChildrenAtHome, EnglishEducation, 
 SpanishEducation, FrenchEducation, EnglishOccupation, SpanishOccupation, FrenchOccupation, HouseOwnerFlag, 
 NumberCarsOwned, AddressLine1, AddressLine2, Phone, DateFirstPurchase, CommuteDistance)
 FROM 'https://datalakexxxxxx.dfs.core.windows.net/files/data/Customer.csv'
 WITH
 (
 FILE_TYPE = 'CSV'
 ,MAXERRORS = 5
 ,FIRSTROW = 2 -- skip header row
 ,ERRORFILE = 'https://datalakexxxxxx.dfs.core.windows.net/files/'
 );

 SELECT *
 FROM dbo.StageCustomer

On the files tab, view the root folder of your data lake and verify that a new folder named _rejectedrows has been created (if you don’t see this folder, in the More menu, select Refresh to refresh the view).

Open the _rejectedrows folder and the date and time specific subfolder it contains, and note that files with names similar to QID123_1_2.Error.Txt and QID123_1_2.Row.Txt have been created. You can right-click each of these files and select Preview to see details of the error and the row that was rejected.

The use of staging tables enables you to validate or transform data before moving or using it to append to or upsert into any existing dimension tables. The COPY statement provides a simple but high-performance technique that you can use to easily load data from files in a data lake into staging tables, and as you’ve seen, identify and redirect invalid rows.

 CREATE TABLE dbo.DimProduct
 WITH
 (
     DISTRIBUTION = HASH(ProductAltKey),
     CLUSTERED COLUMNSTORE INDEX
 )
 AS
 SELECT ROW_NUMBER() OVER(ORDER BY ProductID) AS ProductKey,
     ProductID AS ProductAltKey,
     ProductName,
     ProductCategory,
     Color,
     Size,
     ListPrice,
     Discontinued
 FROM dbo.StageProduct;

 SELECT ProductKey,
     ProductAltKey,
     ProductName,
     ProductCategory,
     Color,
     Size,
     ListPrice,
     Discontinued
 FROM dbo.DimProduct;

 INSERT INTO dbo.DimCustomer ([GeographyKey],[CustomerAlternateKey],[Title],[FirstName],[MiddleName],[LastName],[NameStyle],[BirthDate],[MaritalStatus],
 [Suffix],[Gender],[EmailAddress],[YearlyIncome],[TotalChildren],[NumberChildrenAtHome],[EnglishEducation],[SpanishEducation],[FrenchEducation],
 [EnglishOccupation],[SpanishOccupation],[FrenchOccupation],[HouseOwnerFlag],[NumberCarsOwned],[AddressLine1],[AddressLine2],[Phone],
 [DateFirstPurchase],[CommuteDistance])
 SELECT *
 FROM dbo.StageCustomer AS stg
 WHERE NOT EXISTS
     (SELECT * FROM dbo.DimCustomer AS dim
     WHERE dim.CustomerAlternateKey = stg.CustomerAlternateKey);

 -- Type 1 updates (change name, email, or phone in place)
 UPDATE dbo.DimCustomer
 SET LastName = stg.LastName,
     EmailAddress = stg.EmailAddress,
     Phone = stg.Phone
 FROM DimCustomer dim inner join StageCustomer stg
 ON dim.CustomerAlternateKey = stg.CustomerAlternateKey
 WHERE dim.LastName <> stg.LastName OR dim.EmailAddress <> stg.EmailAddress OR dim.Phone <> stg.Phone

 -- Type 2 updates (address changes triggers new entry)
 INSERT INTO dbo.DimCustomer
 SELECT stg.GeographyKey,stg.CustomerAlternateKey,stg.Title,stg.FirstName,stg.MiddleName,stg.LastName,stg.NameStyle,stg.BirthDate,stg.MaritalStatus,
 stg.Suffix,stg.Gender,stg.EmailAddress,stg.YearlyIncome,stg.TotalChildren,stg.NumberChildrenAtHome,stg.EnglishEducation,stg.SpanishEducation,stg.FrenchEducation,
 stg.EnglishOccupation,stg.SpanishOccupation,stg.FrenchOccupation,stg.HouseOwnerFlag,stg.NumberCarsOwned,stg.AddressLine1,stg.AddressLine2,stg.Phone,
 stg.DateFirstPurchase,stg.CommuteDistance
 FROM dbo.StageCustomer AS stg
 JOIN dbo.DimCustomer AS dim
 ON stg.CustomerAlternateKey = dim.CustomerAlternateKey
 AND stg.AddressLine1 <> dim.AddressLine1;

ALTER INDEX ALL ON dbo.DimProduct REBUILD;

CREATE STATISTICS customergeo_stats
 ON dbo.DimCustomer (GeographyKey);

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值