Decode Base64 with Azure Synapse Analytics its Serverless SQL
 
    I wanted to analyze some data in Azure Synapse Analytics, Serverless SQL. This data was coming in from my IoT Hub that I created and was being routed towards an Azure Data Lake Storage account as small JSON files. However when trying to analyze this data, it appeared that IoT Hub shows the body as Base64 encoded and T-SQL couldn't analyze it.
Problem Statement
Normally, to convert base64 towards a string, we can use a built-in method where it converts this as XML. The following SQL statement can then be used:
SELECT
    [JSON] = cast(cast('' as XML).value('xs:base64Binary(sql:column("StackOverflow.Base64JSON"))', 'varbinary(30)') as varchar(30))
FROM my_table
However, when executing this in Synapse Analytics, Serverless SQL we get the following error:
xml is not supported
Solution
To solve this, we need to execute the following steps:
- Create a User Database (we don't have rights on the master database)
- Set the correct active Database
- Create a Base64 Decoder
- Load our JSON data
- Decode our Column
Breaking this down
1. Create a User Database
CREATE DATABASE MY_DATABASE;
2. Setting the correct active database
USE MY_DATABASE;
3. Creating the Base64 Decoder
CREATE FUNCTION dbo.tvf_Base64Decoder( @encodedPayload VARCHAR(8000) )
RETURNS TABLE
AS
RETURN WITH cte8Rows AS 
    (   
        -- generating 4096 rows, here are the first 8
        SELECT  ones = 1
        FROM    
            (
                values  (1),(1),(1),(1)
                    ,   (1),(1),(1),(1)
            ) AS tens(ones)
    )
    ,   cte64Rows AS 
    (   
        -- generating 4096 rows, here are the first 64
        SELECT  ones = 1
        FROM    cte8Rows       AS eight1 
        CROSS   JOIN cte8Rows  AS eight2
    )
    ,   cteSextet as
    (   
        -- a Base64 encoded string is comprised of 4-byte sextets, create an iterator of starting offsets for each sextet
        SELECT  top ((len(@encodedPayload) / 4)) 
                setextOffset = ((ROW_NUMBER() OVER (ORDER BY (SELECT 1))) * 4) -3
        FROM    cte64Rows      AS sixtyfour1 
        CROSS   JOIN cte64Rows AS sixtyfour2
    )
    ,   cteMapping as
    (   
        -- Base64 mapping table
        SELECT  Base64.[value]
            ,   Base64.[char]
        FROM 
            (   
                values  ( 0, 'A'), ( 1, 'B'), ( 2, 'C'), ( 3, 'D'), ( 4, 'E'), ( 5, 'F'), ( 6, 'G'), ( 7, 'H'), ( 8, 'I'), ( 9, 'J'), (10, 'K'), (11, 'L'), (12, 'M'), (13, 'N'), (14, 'O'), (15, 'P')
                    ,   (16, 'Q'), (17, 'R'), (18, 'S'), (19, 'T'), (20, 'U'), (21, 'V'), (22, 'W'), (23, 'X'), (24, 'Y'), (25, 'Z'), (26, 'a'), (27, 'b'), (28, 'c'), (29, 'd'), (30, 'e'), (31, 'f')
                    ,   (32, 'g'), (33, 'h'), (34, 'i'), (35, 'j'), (36, 'k'), (37, 'l'), (38, 'm'), (39, 'n'), (40, 'o'), (41, 'p'), (42, 'q'), (43, 'r'), (44, 's'), (45, 't'), (46, 'u'), (47, 'v')
                    ,   (48, 'w'), (49, 'x'), (50, 'y'), (51, 'z'), (52, '0'), (53, '1'), (54, '2'), (55, '3'), (56, '4'), (57, '5'), (58, '6'), (59, '7'), (60, '8'), (61, '9'), (62, '+'), (63, '/')
            ) AS Base64([value], [char])
    )
    ,   cteDecode as
    (   
        -- the fun stuff! (see explaination above)
        SELECT  chars =    concat(
            char(
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset   , 1) collate Latin1_General_CS_AS) *  4) AS BINARY(1)) AS TINYINT) 
                | 
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +1, 1) collate Latin1_General_CS_AS) / 16) AS BINARY(1)) AS TINYINT)
            )                  
            , char(
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +1, 1) collate Latin1_General_CS_AS) * 16) AS BINARY(1)) AS TINYINT) 
                | 
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +2, 1) collate Latin1_General_CS_AS) /  4) AS BINARY(1)) AS TINYINT)
            )
            , char(
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +2, 1) collate Latin1_General_CS_AS) * 64) AS BINARY(1)) AS TINYINT) 
                | 
                cast(cast(((SELECT m.[value] FROM cteMapping AS m WHERE m.[char] = SUBSTRING(@encodedPayload, s.setextOffset +3, 1) collate Latin1_General_CS_AS) /  1) AS BINARY(1)) AS TINYINT)
            )
        )
        FROM    cteSextet AS s
    )
    SELECT  decodedPayload = string_agg(chars, '')
    FROM    cteDecode;
4. Loading our JSON Data
SELECT
    JSON_VALUE(jsonContent, '$.Body') AS jsonBody
FROM
    OPENROWSET(
        BULK 'https://myadlsaccount.dfs.core.windows.net/myadlscontainer/myadlsfolder/*/*/*/*/*/*.json',
        FORMAT = 'CSV',
        FIELDQUOTE = '0x0b',
        FIELDTERMINATOR ='0x0b',
        ROWTERMINATOR = '0x0b'
    )
    WITH (
        jsonContent varchar(MAX)
    ) AS [result]
5. Decoding the Column
WITH decoder AS (
    SELECT
        JSON_VALUE(jsonContent, '$.Body') AS jsonBody
    FROM
        OPENROWSET(
            BULK 'https://myadlsaccount.dfs.core.windows.net/myadlscontainer/myadlsfolder/*/*/*/*/*/*.json',
            FORMAT = 'CSV',
            FIELDQUOTE = '0x0b',
            FIELDTERMINATOR ='0x0b',
            ROWTERMINATOR = '0x0b'
        )
        WITH (
            jsonContent varchar(MAX)
        ) AS [result]
)
SELECT 
    decoded = (SELECT decodedPayload FROM dbo.tvf_Base64Decoder(jsonBody))
FROM decoder
Conclusion
When you now combine those 5 steps into one SQL file, you will get a decoded JSON string containing your IoT Hub Body!
⚠ I personally would recommend utilizing Spark, seeing that this method works, but is not the fastest!
 
                    
Comments ()