0

I'm using a REST API to retrieve data from an Azure Table using the code below:

library(httr)
library(RCurl)
library(bitops)
library(xml2)

# Stores credentials in variable

Account <- "storageaccount"
Container <- "Usage"
Key <- "key"


# Composes URL

URL <- paste0(
          "https://", 
          Account, 
          ".table.core.windows.net", 
          "/", 
          Container
        )

# Requests time stamp

requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")


# As per Microsoft's specs, an empty line is needed for content-length

content_lenght <- 0

# Composes signature string

signature_string <- paste0(
                     "GET", "\n",                 # HTTP Verb
                     "\n",                        # Content-MD-5
                     "text/xml", "\n",            # Content-Type
                     requestdate, "\n",           # Date
                     "/", Account, "/", Container # Canonicalized resource
                    )

# Composes header string

header_string <- add_headers(
                    Authorization=paste0(
                      "SharedKey ", 
                      Account, 
                      ":",
                      RCurl::base64(
                        digest::hmac(
                          key = RCurl::base64Decode(
                            Key, mode = "raw"
                          ),
                          object = enc2utf8(signature_string),
                          algo = "sha256", 
                          raw = TRUE
                            )
                        )
                    ),
                    'x-ms-date' = requestdate, 
                    'x-ms-version' = "2020-12-06",
                    'Content-type' = "text/xml"
                  )

# Creates request

xml_body = content(
                GET(
                  URL, 
                  config = header_string, 
                  verbose()
                  ),
                "text"
                )

Get_data <- xml_body                             # Gets data as text from API  
From_JSON <-fromJSON(Get_data, flatten = TRUE)   # Parses text from JSON
Table_name <- as.data.frame(From_JSON)           # Saves data to a table

I can now view the table, but I noted that I can only see the first 1000 rows. What's the most efficient way to implement a loop/cycle that retrieves all the remaining rows and updates the table?

I need to be able to work on the entire dataset.

Also consider that this table will be updated with ~40,000 rows per day, so keeping the visuals current with the data is a concern.

Thanks in advance for your suggestions!

~Alienvolm

Ansuman Bal
  • 9,705
  • 2
  • 10
  • 27
Alienvolm
  • 143
  • 9

3 Answers3

1

Not sure how you would implement this in R specifically but here is the general approach:

When you list entities from a table, a maximum of 1000 entities are returned in a single request. If the table contains more than 1000 entities, Table Service will return two additional headers: x-ms-continuation-NextPartitionKey and x-ms-continuation-NextRowKey. Presence of these two headers indicate that there's more data available for you to fetch.

What you would need to do is use these headers and specify two query parameters in your next request URL: NextPartitionKey and NextRowKey. So your request would be something like:

https://account.table.core.windows.net/Table?NextPartitionKey=<x-ms-continuation-NextPartitionKey header value>&NextRowKey=<x-ms-continuation-NextRowKey header value>.

You would need to repeat the process till the time you do not get these headers in the response.

You can learn more about it here: https://learn.microsoft.com/en-us/rest/api/storageservices/query-timeout-and-pagination.

Gaurav Mantri
  • 128,066
  • 12
  • 206
  • 241
0

Thanks for the hint! I have put together some code... Unfortunately, I don't get past the first loop (to retrieve page 3) and I don't quite get why.

I composed it by making some assumptions such as, that the metadata is returned always with the same structure.

This is the code:

library(httr)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)


# Retrieves metadata
Get_headers <- capture.output(
                 content(
                   GET(
                   URL, 
                   config = header_string, 
                   verbose()
                  )
                 ),
              type = "message")

Server_response <- Get_headers[11] %>%
                    trimws( whitespace = "\r") %>% 
                     trimws( whitespace = "<- ") %>%
                      grepl("HTTP/1.1 200 OK")


# Initializes variables
Pages <- 0
Next_headers_count <- 0

# Fetches data only if authentication was successful

if (Server_response = TRUE) {
   
   Get_data <- xml_body                             # Gets data as text from API  
   From_JSON <-fromJSON(Get_data, flatten = TRUE)   # Parses text from JSON
   Table_name <- as.data.frame(From_JSON)           # Saves data to a table
   Pages <- Pages + 1                               # One page of data has been retrieved
   
   # Checks if there are more than 1000 rows to be fetched
   
   x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%                                              
                                         trimws( whitespace = "<- ") %>%
                                            gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
                                              grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)

   x_ms_continuation_NextRowKey <- Get_headers[20] %>%
                                  trimws( whitespace = "<- ") %>% 
                                    gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
                                       grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
   
   # Starts loop to retrieve additional data
   
   while (x_ms_continuation_NextPartitionKey = TRUE & 
          x_ms_continuation_NextRowKey = TRUE) {
             
             Pages <- Pages + 1                                      # Counts the number of pages retrieved, including the initial page
             Next_headers_count <- Next_headers_count +1             # Counts the number of Next headers passed by the metadata
             
             Next_Partition_Key <- Get_headers[19] %>%               # Extracts the value of the Next Partition Key
                                     str_remove(".+(?= )") %>%       
                                       trimws( whitespace =" ") %>%
                                         trimws( whitespace = "\r")

             Next_Row_key <- Get_headers[20] %>%                     # Extracts the value of the Next Row Key
                                str_remove(".+(?= )") %>% 
                                  trimws( whitespace =" ") %>%
                                    trimws( whitespace = "\r")
   
             Next_URL <- paste0(                                     # Creates the URL for the Next Authentication token
                       "https://", 
                        Account, 
                        ".table.core.windows.net", 
                        "/", 
                        Container, 
                         "?", 
                       "NextPartitionKey=", 
                        Next_Partition_Key, 
                        "&NextRowKey=",
                        Next_Row_key
                    )
    
              next_xml_body = content(                            # Retrieves next 1000 rows of content from table
                                  GET(
                                    Next_URL, 
                                    config = header_string, 
                                    verbose()
                                   ),
                                  "text"
                                )
             
             Get_new_data <- next_xml_body                             # Gets data as text from API 
             From_JSON <-fromJSON(Get_new_data, flatten = TRUE)        # Parses text from JSON
             Temp_table_name <- as.data.frame(From_JSON)               # Saves data to a table
             Table_name <- bind_rows(Temp_table_name, Table_name)      # Appends new data to the initial data

             Get_new_headers <- capture.output(                        # Retrieves new next headers
                                 content(
                                   GET(
                                   Next_URL, 
                                   config = header_string, 
                                   verbose()
                                    )
                                   ),
                                 type = "message")
             
             New_server_response <- Get_new_headers[11] %>%
                                     trimws( whitespace = "\r") %>% 
                                      trimws( whitespace = "<- ") %>%
                                         grepl("HTTP/1.1 200 OK")
             
             # Checks if there are more than 1000 rows to be fetched
   
             New_x_ms_continuation_NextPartitionKey <- Get_new_headers[19] %>%                                              
                                                      trimws( whitespace = "<- ") %>%
                                                        gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
                                                          grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)

             New_x_ms_continuation_NextRowKey <- Get_new_headers[20] %>%
                                                trimws( whitespace = "<- ") %>% 
                                                  gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
                                                    grepl("x-ms-continuation-NextRowKey", fixed = TRUE)
             
             x_ms_continuation_NextPartitionKey <- New_x_ms_continuation_NextPartitionKey
             x_ms_continuation_NextRowKey <- New_x_ms_continuation_NextRowKey
             
             Next_Partition_Key <- Get_new_headers[19] %>%               # Extracts the value of the Next Partition Key
                                     str_remove(".+(?= )") %>%       
                                       trimws( whitespace =" ") %>%
                                         trimws( whitespace = "\r")

             Next_Row_key <- Get_new_headers[20] %>%                     # Extracts the value of the Next Row Key
                                str_remove(".+(?= )") %>% 
                                  trimws( whitespace =" ") %>%
                                    trimws( whitespace = "\r")
             
   } 
   
} else {print("authentication failed")}

# Previews table
Pages
Next_headers_count
View(Table_name)

With this, I can retrieve only 2000 entries. When the next cycle starts it fails. It seems that it fails here:

Get_new_headers <- capture.output(                        # Retrieves new next headers
                                 content(
                                   GET(
                                   Next_URL, 
                                   config = header_string, 
                                   verbose()
                                    )
                                   ),
                                 type = "message")

This is the error: enter image description here

Any help would be greatly appreciated!

Alienvolm
  • 143
  • 9
0

I figured it out... I improved the script and now it's working. :-)

The code can be further refined, but this is a working script that retrieves all the data from the table in iterations.

Connects to an Azure Table based on the specifications for Shared Key: https://learn.microsoft.com/en-us/rest/api/storageservices/authorize-with-shared-key


library(httr)
library(RCurl)
library(bitops)
library(xml2)
library(jsonlite)
library(stringr)
library(dplyr)
library(tidyr)

# Stores credentials in variable

Account <- "storage"
Container <- "Usage"
Key <- "key"


# Composes URL

URL <- paste0(
          "https://", 
          Account, 
          ".table.core.windows.net", 
          "/", 
          Container
        )

# Requests time stamp

requestdate <- format(Sys.time(), "%a, %d %b %Y %H:%M:%S %Z", tz = "GMT")


# As per Microsoft's specs, an empty line is needed for content-length

content_lenght <- 0

# Composes signature string

signature_string <- paste0(
                     "GET", "\n",                 # HTTP Verb
                     "\n",                        # Content-MD-5
                     "text/xml", "\n",            # Content-Type
                     requestdate, "\n",           # Date
                     "/", Account, "/", Container # Canonicalized resource
                    )

# Composes header string

header_string <- add_headers(
                    Authorization=paste0(
                      "SharedKey ", 
                      Account, 
                      ":",
                      RCurl::base64(
                        digest::hmac(
                          key = RCurl::base64Decode(
                            Key, mode = "raw"
                          ),
                          object = enc2utf8(signature_string),
                          algo = "sha256", 
                          raw = TRUE
                            )
                        )
                    ),
                    'x-ms-date' = requestdate, 
                    'x-ms-version' = "2020-12-06",
                    'Content-type' = "text/xml"
                  )


# Calls


Get_headers <- capture.output(                                          # Retrieves metadata
                  content(
                    GET(
                      URL, 
                      config = header_string,
                      verbose()
                    )
                  ), 
                 type = "message"
               )       

Server_response <- Get_headers[11] %>%                                    # Retrieves server response
                         trimws( whitespace = "\r") %>% 
                            trimws( whitespace = "<- ") %>%
                               grepl("HTTP/1.1 200 OK")

Get_headers
Server_response

# Initializes counters

Pages <- 0
Next_headers_count <- 0

  while(isTRUE(Server_response)) {
    
        Pages <- Pages + 1
    
        xml_body <- content(                                       # Retrieves up to 1000 rows from the table
                        GET(
                          URL, 
                          config = header_string, 
                           verbose()
                         ), 
                        "text"
                      ) 
        
        Get_data <- xml_body                                      # Gets data as text from API  
        From_JSON <-fromJSON(Get_data, flatten = TRUE)            # Parses text from JSON
        Temp_table_name <- as.data.frame(From_JSON)               # Saves current rows to temp table
        Table_name <- bind_rows(Temp_table_name, Table_name)      # Appends new data to the initial data
    
        # Checks if there are more than 1000 rows to be fetched
    
        x_ms_continuation_NextPartitionKey <- Get_headers[19] %>%                                              
                                                trimws( whitespace = "<- ") %>%
                                                  gsub("\\.*", "x-ms-continuation-NextPartitionKey") %>%
                                                    grepl("x-ms-continuation-NextPartitionKey", fixed = TRUE)
   
        x_ms_continuation_NextRowKey <- Get_headers[20] %>%
                                          trimws( whitespace = "<- ") %>% 
                                             gsub("\\.*", "x-ms-continuation-NextRowKey") %>%
                                               grepl("x-ms-continuation-NextRowKey", fixed = TRUE) 
        
        x_ms_continuation_NextPartitionKey
        x_ms_continuation_NextRowKey
        
          if (isTRUE(x_ms_continuation_NextPartitionKey) & 
                 isTRUE(x_ms_continuation_NextRowKey)) {
          
                 Next_headers_count <- Next_headers_count + 1
          
                 Next_Partition_Key <- Get_headers[19] %>%                    # Extracts the value of the Next Partition Key
                                          str_remove(".+(?= )") %>%       
                                            trimws( whitespace =" ") %>%
                                              trimws( whitespace = "\r")
             
                 Next_Row_key <- Get_headers[20] %>%                          # Extracts the value of the Next Row Key
                                     str_remove(".+(?= )") %>% 
                                        trimws( whitespace =" ") %>%
                                           trimws( whitespace = "\r")
          
                 URL <- paste0(                                               # Creates the URL for the Next Authentication token
                         "https://", 
                          Account, 
                         ".table.core.windows.net", 
                         "/", 
                         Container, 
                         "?", 
                        "NextPartitionKey=", 
                         Next_Partition_Key, 
                         "&NextRowKey=",
                         Next_Row_key
                        )
           
               
                  Get_headers <- capture.output(                                          # Retrieves new metadata
                                       content(
                                             GET(
                                               URL, 
                                               config = header_string,
                                               verbose()
                                             )
                                          ), 
                                        type = "message"
                                   )       

                   Server_response <- Get_headers[11] %>%                                    # Retrieves new server response
                                          trimws( whitespace = "\r") %>% 
                                             trimws( whitespace = "<- ") %>%
                                                 grepl("HTTP/1.1 200 OK")
          }
        
  }

Pages
Next_headers_count
View(Table_name)
Alienvolm
  • 143
  • 9