404 error on downloading public box folders. Python API is finding the file names though

A government agency publishes a public folder to share a data set. I have no relationship with the publisher, thus no credentials.
With the web browser interface, no authentication is required, works really well, can download every file.

The Python SDK is getting super difficult for the task of accessing a public box folder.
I can retrieve a list of the file and folder names.
When I try to get anything with the file object file(file_id=item.id).content … download_url(), download…

I get a 404 error, Not found.

The assumptions I’m making:

  1. I need a developer account
  2. I need to create a custom app
  3. Select an auth method (e.g. OAuth)
    3.5 Download the 3 secrets (client_id, client_secret, token) from custom app.
  4. Build a client object with the OAuth
  5. With either the URL or the folder ID, get an object for the root folder.
  6. Iterate over objects in the root_folder

Based on one post from Box engineer, I get the sense these developer accounts and credentials don’t give you “Permissions” on public folders. No info was given on how to proceed.

Is there a solution to automating downloads of public folders?
Thanks!

Code

# Failing snippet from the question, kept as posted (classic boxsdk API).
# Build the client and resolve the public shared link to its root folder.
client = Client(auth)
SHARED_URL = "https://nihcc.app.box.com/v/ChestXray-NIHCC"
root_folder =  client.get_shared_item(SHARED_URL, '')
print(root_folder.name) # prints CXR8
items = root_folder.get_items()

# Listing works: names and IDs of the shared folder's children are returned.
for item in items:
    print('{0} {1} is named "{2}"'.format(item.type.capitalize(), item.id, item.name))
    # prints: File 939374043869 is named "AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf"
    if item.type == 'file':
        # NOTE(review): download_folder is not defined in this snippet and
        # file_name is never used afterwards.
        file_name = F"{download_folder}/{item.name}"
        print(item.id) # prints: 939374043869
        # NOTE(review): client.file(item.id) builds a new File object from the
        # bare ID; presumably it does not carry the shared-link context that
        # `item` got through get_shared_item(), so the ID is looked up in the
        # caller's own Box account and the request returns 404 -- confirm.
        # The trailing "." after the call is a syntax error as pasted.
        download_url = client.file(item.id).get_download_url(). # fail 404 error
1 Like

Hi @dmoore247 , welcome to the forum!

Yes this can be a little confusing…

I’m not sure which Python SDK you are using, but with the Next Gen one, the issue is that you need to send the original shared link to download the files.

I have prepared an example for you:

import os
import shutil
from io import BufferedIOBase
import dotenv
from box_sdk_gen import (
    BoxCCGAuth,
    CCGConfig,
    BoxClient as Client,
    BoxAPIError,
    Items,
)
from box_sdk_gen import DownloadsManager

ENV_CCG = ".ccg.env"


class ConfigCCG:
    """Settings for the CCG demo, collected from environment variables.

    The dotenv file named by ``ENV_CCG`` is loaded before the variables are
    read. Variables that are not set are stored as ``None``, except
    ``CACHE_FILE`` which falls back to ``".ccg.tk"``.
    """

    def __init__(self) -> None:
        # Load the dotenv file first so its entries are visible below.
        dotenv.load_dotenv(ENV_CCG)
        env = os.environ

        # Credentials shared by every auth mode.
        self.client_id = env.get("CLIENT_ID")
        self.client_secret = env.get("CLIENT_SECRET")

        # Values specific to Client Credentials Grant.
        self.enterprise_id = env.get("ENTERPRISE_ID")
        self.ccg_user_id = env.get("CCG_USER_ID")

        # Token cache location.
        self.cache_file = env.get("CACHE_FILE", ".ccg.tk")


def main():
    """Download the contents of the public NIH shared folder into ``downloads``.

    Authenticates with Client Credentials Grant using the values held by
    ``ConfigCCG``, resolves the shared link to its folder, and hands the
    folder's items to ``download_items``. The ``downloads`` directory is
    created when missing, and the working directory is restored even if the
    download fails. Box API errors are printed rather than raised.
    """
    conf = ConfigCCG()

    ccg_conf = CCGConfig(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    auth = BoxCCGAuth(ccg_conf)
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"
    # The shared link is passed with each request below so the IDs are
    # resolved in the context of the shared folder.
    boxapi = "shared_link=" + web_link_url

    user = client.users.get_user_me()
    print(f"User: {user.id}:{user.name}")

    try:
        shared_folder = (
            client.shared_links_folders.find_folder_for_shared_link(
                boxapi=boxapi
            )
        )
        print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
        print("#" * 80)

        print("Type\tID\t\tName")
        # Create the target directory instead of crashing when it is absent.
        os.makedirs("downloads", exist_ok=True)
        start_dir = os.getcwd()
        os.chdir("downloads")
        try:
            items = client.folders.get_folder_items(
                shared_folder.id, boxapi=boxapi
            )
            download_items(client, items, web_link_url)
        finally:
            # Always return to where we started, even after an API error.
            os.chdir(start_dir)
    except BoxAPIError as e:
        print(f"Error: {e}")


def download_items(client: Client, items: Items, web_link_url):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into; files
    are streamed to disk under their Box name. Files ending in ``.tar.gz``
    are skipped. The working directory is restored after each sub-folder
    even when a download inside it raises.

    Args:
        client: Box client used for the folder listing and download calls.
        items: Listing whose ``entries`` are the items to process.
        web_link_url: Shared link URL, sent with every request so item IDs
            are resolved in the shared folder's context.

    NOTE(review): only the entries of the listing passed in are processed;
    whether large folders need paging is not visible here -- confirm.
    """
    boxapi = "shared_link=" + web_link_url

    for item in items.entries:
        if item.type == "folder":
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type.value}\t{item.id}\t{item.name}")
                print("-" * 80)
                # Separate name so the listing being iterated is not rebound.
                child_items = client.folders.get_folder_items(
                    item.id, boxapi=boxapi
                )
                download_items(client, child_items, web_link_url)
            finally:
                # Leave the sub-folder even if something inside it failed.
                os.chdir(parent_dir)

        elif item.type == "file":
            print(f"{item.type.value}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            file_content_stream: BufferedIOBase = (
                client.downloads.download_file(item.id, boxapi=boxapi)
            )
            with open(item.name, "wb") as f:
                shutil.copyfileobj(file_content_stream, f)

            print("\tdone")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
    print("Done")

The result is (I’ve skipped the big .tar.gz files):

User: 20706451735:CCG
Shared Folder: 36938765345:CXR8
################################################################################
Type    ID              Name
--------------------------------------------------------------------------------


folder  37178474737     images
--------------------------------------------------------------------------------
file    371647823217    batch_download_zips.py  done
file    219764235225    images_001.tar.gz        .tar.gz skipped
file    219767703471    images_002.tar.gz        .tar.gz skipped
file    219770039352    images_003.tar.gz        .tar.gz skipped
file    221185642661    images_004.tar.gz        .tar.gz skipped
file    219776556743    images_005.tar.gz        .tar.gz skipped
file    219777758783    images_006.tar.gz        .tar.gz skipped
file    220610700915    images_007.tar.gz        .tar.gz skipped
file    219776273384    images_008.tar.gz        .tar.gz skipped
file    219782291318    images_009.tar.gz        .tar.gz skipped
file    219781375034    images_010.tar.gz        .tar.gz skipped
file    219777519815    images_011.tar.gz        .tar.gz skipped
file    219778785923    images_012.tar.gz        .tar.gz skipped
--------------------------------------------------------------------------------


folder  174256157515    LongTailCXR
--------------------------------------------------------------------------------
file    1022679833262   nih-cxr-lt_image_ids.csv        done
file    1022664274647   nih-cxr-lt_single-label_balanced-test.csv       done
file    1022634602877   nih-cxr-lt_single-label_balanced-val.csv        done
file    1022664681945   nih-cxr-lt_single-label_test.csv        done
file    1022681300213   nih-cxr-lt_single-label_train.csv       done
file    1022683738717   README.txt      done
--------------------------------------------------------------------------------


folder  223604149466    PruneCXR
--------------------------------------------------------------------------------
file    1292084530974   miccai2023_nih-cxr-lt_labels_test.csv   done
file    1292081161269   miccai2023_nih-cxr-lt_labels_train.csv  done
file    1292096337058   miccai2023_nih-cxr-lt_labels_val.csv    done
file    1292097450400   README.txt      done
file    939374043869    AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf      done
file    1001272740624   AAA Physician AI Research Opportunity!!!.pdf    done
file    906187165990    AAA Postdoctoral Fellowship Opportunity!!! - NIH Medical Image Analysis Postdoc.pdf     done
file    256057377774    ARXIV_V5_CHESTXRAY.pdf  done
file    219760940956    BBox_List_2017.csv      done
file    219760887468    Data_Entry_2017_v2020.csv       done
file    249502714403    FAQ_CHESTXRAY.pdf       done
file    249505703122    LOG_CHESTXRAY.pdf       done
file    220660789610    README_CHESTXRAY.pdf    done
file    256055473534    test_list.txt   done
file    256056636701    train_val_list.txt      done
Done

The trick in this Gen SDK is this line:

items = client.folders.get_folder_items(
                item.id, boxapi="shared_link=" + web_link_url
            )

It sends the shared link information, so it can be used as security context, otherwise it is looking for the folder/file in your own box instance, as opposed to NIHCC.

Let me know if this helps.

Best regards

Hi @dmoore247

I was looking into the differences between the classic and next gen sdk.

So here is the same example for the classic SDK:

"""demo to download files from a box web link"""

import os
from boxsdk import JWTAuth, Client


def main():
    """Download the contents of the public NIH shared folder into ``downloads``.

    Authenticates with the JWT settings in ``.jwt.config.json``, resolves the
    shared link to its folder object, and passes the folder's items to
    ``download_items``. The ``downloads`` directory is created when missing,
    and the working directory is restored even if a download raises.
    """
    auth = JWTAuth.from_settings_file(".jwt.config.json")
    auth.authenticate_instance()
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)

    print("Type\tID\t\tName")
    # Create the target directory instead of crashing when it is absent.
    os.makedirs("downloads", exist_ok=True)
    start_dir = os.getcwd()
    os.chdir("downloads")
    try:
        download_items(shared_folder.get_items())
    finally:
        # Always return to where we started, even after a failed download.
        os.chdir(start_dir)


def download_items(items):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into; files
    are written to disk under their Box name. Files ending in ``.tar.gz``
    are skipped. The working directory is restored after each sub-folder
    even when a download inside it raises.

    Args:
        items: Iterable of objects exposing ``type``, ``id`` and ``name``.
            Folder items must provide ``get_items()`` and file items
            ``download_to(file_object)``.
    """
    for item in items:
        if item.type == "folder":
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type}\t{item.id}\t{item.name}")
                print("-" * 80)

                download_items(item.get_items())
            finally:
                # Leave the sub-folder even if something inside it failed.
                os.chdir(parent_dir)

        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue

            with open(item.name, "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
    print("Done")

This one ends up being simpler, because it already has the context of the shared folder, and automatically handles that extra parameter/header.

Let us know if this helps.

@rbarbosa Many thanks!
Very detailed examples, and they run in your environment!

A few things I struggle with in these examples concern where to obtain the dependencies:

  1. .jwt.config.json
  2. What %pip installs to run (I eventually figured this out for the first attempt)
  3. I did figure out CLIENT_ID, CLIENT_SECRET came from the dev console → OAUTH app …
  4. I don’t know where to start to find the .ccg.env and what might go in there
  5. CCG_USER_ID ? (20706451735)
  6. The .ccg.tk CACHE_FILE? Does the api library create and manage this?

I eventually just manually downloaded, uploaded, unzipped the file with the miscellaneous. The publisher provided a simple python script with python requests calls for the .tar.gz files.

Thanks again,
Douglas

Hi

The examples I’ve sent were adapted from previous questions, and that is why you find multiple security models, my apologies for the inconsistency.

Hopefully they run in your environment also :slight_smile:, let’s make sure that happens by taking a step back.

There are 3 types of authentication modes for Box applications:

  • OAuth 2.0 - Requires user to manually authorize the application
  • Client credential grants (CCG) - Requires client id, client secret, enterprise id or user id
  • JSON Web Tokens (JWT) - Requires client id, client secret, enterprise or user id, private key, private key passphrase, key id.

To use in a script my recommendation is to go for CCG, it is the easiest to set up and move forward, but you will need to activate your developer account. I’m not sure if you are using a corporate account, a free account or a free developer account.

For this exercise I recommend you create a free developer account, if you haven’t done so yet, and if you need to, later apply this to your other box account.

To create a CCG app, go to your developer console, click create app, and select custom app:

On the second dialog select Server authentication (Client Credentials Grant)

This next step depends on what you need your app to do, but considering just downloads from a shared link, you need this:

Next you need to go through the authorization process. Flip to the authorization tab and press review and submit. This will submit the request to your box administrator, which in this free developer account is you.

Now go back to your box.com app and open your administrator console, select apps on the right side menu, and you should see your app pending authorization. Authorize the app.

Note: Remember to do this process (submit+authorize) every time you change the application configurations.

While you are at your administration console, go to account and billing and take note of your enterprise id. In my case:

Go back to the developer console and take note of the client id and client secret:

You now have everything you need to instantiate a CCG Client using the box SDK.
In my examples I tend to use a .ccg.env or just a .env file, and then use the python-dotenv to import these. You can also use them directly in your script, this is considered less secure, but it is up to you to evaluate that.

The .env file looks like this (macOS/Linux):

# Common settings
CLIENT_ID = YOUR_CLIENT_ID
CLIENT_SECRET = YOUR_CLIENT_SECRET

# CCG Settings
ENTERPRISE_ID = YOUR_ENTERPRISE_ID
CCG_USER_ID = THE_USER_ID

You can ignore the CCG_USER_ID, since it is only used if you want your script to act as a user (as opposed to a service account).

You can also ignore the .ccg.tk cache file; it is used to cache the token and re-use it while still within the 60-minute window, but for this application, I’m assuming getting a new token every time won’t be an issue.

I’ve ignored the .tar.gz files because they were too big for a simple demo. You can remove the if statement to download everything.

Here is a revised version for the classic SDK:

"""demo to download files from a box web link"""

import os
import dotenv
from boxsdk import CCGAuth, Client

ENV_CCG = ".ccg.env"


class ConfigCCG:
    """Settings for the CCG demo, collected from environment variables.

    The dotenv file named by ``ENV_CCG`` is loaded before the variables are
    read. Variables that are not set are stored as ``None``.
    """

    def __init__(self) -> None:
        # Load the dotenv file first so its entries are visible below.
        dotenv.load_dotenv(ENV_CCG)
        env = os.environ

        # Credentials shared by every auth mode.
        self.client_id = env.get("CLIENT_ID")
        self.client_secret = env.get("CLIENT_SECRET")

        # Value specific to Client Credentials Grant.
        self.enterprise_id = env.get("ENTERPRISE_ID")


def main():
    """Download the contents of the public NIH shared folder into ``downloads``.

    Authenticates with Client Credentials Grant using the values held by
    ``ConfigCCG``, resolves the shared link to its folder object, and passes
    the folder's items to ``download_items``. The ``downloads`` directory is
    created when missing, and the working directory is restored even if a
    download raises.
    """
    conf = ConfigCCG()
    auth = CCGAuth(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)

    print("Type\tID\t\tName")
    # Create the target directory instead of crashing when it is absent.
    os.makedirs("downloads", exist_ok=True)
    start_dir = os.getcwd()
    os.chdir("downloads")
    try:
        download_items(shared_folder.get_items())
    finally:
        # Always return to where we started, even after a failed download.
        os.chdir(start_dir)


def download_items(items):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into; files
    are written to disk under their Box name. Files ending in ``.tar.gz``
    are skipped. The working directory is restored after each sub-folder
    even when a download inside it raises.

    Args:
        items: Iterable of objects exposing ``type``, ``id`` and ``name``.
            Folder items must provide ``get_items()`` and file items
            ``download_to(file_object)``.
    """
    for item in items:
        if item.type == "folder":
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type}\t{item.id}\t{item.name}")
                print("-" * 80)

                download_items(item.get_items())
            finally:
                # Leave the sub-folder even if something inside it failed.
                os.chdir(parent_dir)

        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            # comment the above block to download all files

            with open(item.name, "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
    print("Done")

Before installing the Box SDKs, you should create a virtual environment; this is optional but recommended.
For Box classic python SDK:

  • pip install boxsdk
    or
  • pip install "boxsdk[jwt]" - to include JWT support.

For the Next Gen Box SDK:

  • pip install box_sdk_gen
    or
  • pip install "box_sdk_gen[jwt]" - to include JWT support.

Let us know if this helps

Best regards