Decode Base64 with Azure Synapse Analytics its Serverless SQL
I wanted to analyze some data in Azure Synapse Analytics, Serverless SQL. This data was coming in from my IoT Hub that I created and was being routed towards an Azure Data Lake Storage account as small JSON files. However when trying to analyze this data, it appeared that IoT Hub shows the body as Base64 encoded and T-SQL couldn't analyze it.
Problem Statement
Normally, to convert base64 towards a string, we can use a built-in method where it converts this as XML. The following SQL statement can then be used:
SELECT
[JSON] = cast(cast('' as XML).value('xs:base64Binary(sql:column("StackOverflow.Base64JSON"))', 'varbinary(30)') as varchar(30))
FROM my_table
However, when executing this in Synapse Analytics, Serverless SQL we get the following error:
xml is not supported
Solution
To solve this, we need to execute the following steps:
- Create a User Database (we don't have rights on the master database)
- Set the correct active Database
- Create a Base64 Decoder
- Load our JSON data
- Decode our Column
Breaking this down
1. Create a User Database
CREATE DATABASE MY_DATABASE;
2. Setting the correct active database
USE MY_DATABASE;
3. Creating the Base64 Decoder
CREATE FUNCTION dbo.tvf_Base64Decoder( @encodedPayload VARCHAR(8000) )
RETURNS TABLE
AS
RETURN WITH cte8Rows AS
(
-- generating 4096 rows, here are the first 8
SELECT ones = 1
FROM
(
values (1),(1),(1),(1)
, (1),(1),(1),(1)
) AS tens(ones)
)
, cte64Rows AS
(
-- generating 4096 rows, here are the first 64
SELECT ones = 1
FROM cte8Rows AS eight1
CROSS JOIN cte8Rows AS eight2
)
, cteSextet as
(
-- a Base64 encoded string is comprised of 4-byte sextets, create an iterator of starting offsets for each sextet
SELECT top ((len(@encodedPayload) / 4))
setextOffset = ((ROW_NUMBER() OVER (ORDER BY (SELECT 1))) * 4) -3
FROM cte64Rows AS sixtyfour1
CROSS JOIN cte64Rows AS sixtyfour2
)
, cteMapping as
(
-- Base64 mapping table
SELECT Base64.[value]
, Base64.[char]
FROM
(
values ( 0, 'A'), ( 1, 'B'), ( 2, 'C'), ( 3, 'D'), ( 4, 'E'), ( 5, 'F'), ( 6, 'G'), ( 7, 'H'), ( 8, 'I'), ( 9, 'J'), (10, 'K'), (11, 'L'), (12, 'M'), (13, 'N'), (14, 'O'), (15, 'P')
, (16, 'Q'), (17, 'R'), (18, 'S'), (19, 'T'), (20, 'U'), (21, 'V'), (22, 'W'), (23, 'X'), (24, 'Y'), (25, 'Z'), (26, 'a'), (27, 'b'), (28, 'c'), (29, 'd'), (30, 'e'), (31, 'f')
, (32, 'g'), (33, 'h'), (34, 'i'), (35, 'j'), (36, 'k'), (37, 'l'), (38, 'm'), (39, 'n'), (40, 'o'), (41, 'p'), (42, 'q'), (43, 'r'), (44, 's'), (45, 't'), (46, 'u'), (47, 'v')
, (48, 'w'), (49, 'x'), (50, 'y'), (51, 'z'), (52, '0'), (53, '1'), (54, '2'), (55, '3'), (56, '4'), (57, '5'), (58, '6'), (59, '7'), (60, '8'), (61, '9'), (62, '+'), (63, '/')
) AS Base64([value], [char])
)
, cteDecode as
(
-- the fun stuff! (see explaination above)
SELECT chars = concat(
char(
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset , 1) collate Latin1_General_CS_AS) * 4) AS BINARY(1)) AS TINYINT)
|
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +1, 1) collate Latin1_General_CS_AS) / 16) AS BINARY(1)) AS TINYINT)
)
, char(
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +1, 1) collate Latin1_General_CS_AS) * 16) AS BINARY(1)) AS TINYINT)
|
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +2, 1) collate Latin1_General_CS_AS) / 4) AS BINARY(1)) AS TINYINT)
)
, char(
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +2, 1) collate Latin1_General_CS_AS) * 64) AS BINARY(1)) AS TINYINT)
|
cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +3, 1) collate Latin1_General_CS_AS) / 1) AS BINARY(1)) AS TINYINT)
)
)
FROM cteSextet AS s
)
SELECT decodedPayload = string_agg(chars, '')
FROM cteDecode;
4. Loading our JSON Data
SELECT
JSON_VALUE(jsonContent, '$.Body') AS jsonBody
FROM
OPENROWSET(
BULK 'https://myadlsaccount.dfs.core.windows.net/myadlscontainer/myadlsfolder/*/*/*/*/*/*.json',
FORMAT = 'CSV',
FIELDQUOTE = '0x0b',
FIELDTERMINATOR ='0x0b',
ROWTERMINATOR = '0x0b'
)
WITH (
jsonContent varchar(MAX)
) AS [result]
5. Decoding the Column
WITH decoder AS (
SELECT
JSON_VALUE(jsonContent, '$.Body') AS jsonBody
FROM
OPENROWSET(
BULK 'https://myadlsaccount.dfs.core.windows.net/myadlscontainer/myadlsfolder/*/*/*/*/*/*.json',
FORMAT = 'CSV',
FIELDQUOTE = '0x0b',
FIELDTERMINATOR ='0x0b',
ROWTERMINATOR = '0x0b'
)
WITH (
jsonContent varchar(MAX)
) AS [result]
)
SELECT
decoded = (SELECT decodedPayload FROM dbo.tvf_Base64Decoder(jsonBody))
FROM decoder
Conclusion
When you now combine those 5 steps into one SQL file, you will get a decoded JSON string containing your IoT Hub Body!
⚠ I personally would recommend utilizing Spark, seeing that this method works, but is not the fastest!