Best approach to find and download files from a large folder list

a partners is sharing files with us through a account. The files are store in folder tree. There are over 25K of files stored. We receive an xls files containing only filenames. We need to download files based on a filename but I do not know in which folder the file resides. I have made some attempts in python to search and download files, but it looks like my approach is incorrect or I am hitting API rate limits. I have noticed that if I have the file ID I can easily download the files, but I only have the filenames. What would be the best approach for me to create a automated solution using python and the API? Would it be easy for the partner to give me the file_ids? Is there an easy way to retrieve those from all files that are uploaded? Or is there a fast way to search based on filename? I would love to hear what your approach would be.

Hi @edztra

The most efficient option is for the partner to include the file_id or the shared link in the xls file.
You can directly download using a file_id:
For a shared link, you first convert it to a file id and then download it.

Another option is to use search. You can search by file name and only in the file name
There are some disadvantages here, the search may not be up to date, although in your use case that might be unlikely, also the search is always fuzzy, meaning it tends to return not an exact match, make sure you check if the file name is exactly the same of what you’re looking for.

Check out the search workshop:

The last option is to crawl the folders and look for the specified file.
This is the most inefficient, will take a long time, and will consume a lot of API calls.

Let us know if this helps, and if you want to discuss a particular option.


thanks Rui!
The partner is only providing the name (without an extension) I create the script below which work ok so far. I don’t think my Token Refresh will work, but I will check that later. And I don’t think my searches will take longer than an hour. I was wondering how many simultaneous downloads I can trigger, I am now running 10 threads. Is that within the rate limits? Thanks for you help.

My script

#!/usr/bin/env python3

import os
import pandas as pd
from pandas import isna
from boxsdk import OAuth2, Client
import dotenv
from datetime import datetime
import logging
import traceback
import threading
import sys
from threading import Semaphore
from tqdm import tqdm

# Global lock for DataFrame and logging
df_lock = threading.Lock()  # Lock for thread safety when updating DataFrame

# Configure logging
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
current_date ="%d_%m_%Y")
log_filename = f"log_{current_date}.log"
fh = logging.FileHandler(log_filename)  

download_count = 0  # A counter for downloaded files
download_count_lock = threading.Lock()  # Lock for thread safety when updating the counter

def authenticate_box_client():
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')
    refresh_token = os.getenv('BOX_REFRESH_TOKEN')  # Load the refresh token

        auth = OAuth2(
            refresh_token=refresh_token,  # Include the refresh token
        return Client(auth)
    except Exception as e:
        logging.error(f"Authentication failed: {e}")
        raise e

def store_tokens(access_token, refresh_token):
    dotenv_path = dotenv.find_dotenv()
    if not dotenv_path:
        logging.warning("No .env file found.")

    dotenv.set_key(dotenv_path, 'BOX_ACCESS_TOKEN', access_token)
    dotenv.set_key(dotenv_path, 'BOX_REFRESH_TOKEN', refresh_token)
    logging.debug(f"New access token and refresh token have been stored securely.")

def check_required_environment_vars():
    for var in required_vars:
        if os.getenv(var) is None:
            logging.critical(f"Missing '{var}' in environment.")
            raise EnvironmentError(f"Missing '{var}' in environment.")

def search_file(client, filename, folder_id):
    search_results =
    found_files = []
    counter = 0
    for item in search_results:
        counter += 1
        if == filename.lower():
        if counter >= 2:
    return found_files

def update_dataframe(df, index, status, excel_file_path=None, acquire_lock=True):
        if acquire_lock:
        # old_status =[index, 'Status'][index, 'Status'] = status

        if excel_file_path:
            df.to_excel(excel_file_path, index=False)
            #"DataFrame saved to Excel at {excel_file_path}")

        if acquire_lock:
    except Exception as e:
        if acquire_lock:
        logging.error(f"Error in update_dataframe(): {e}")
        print(f"Error in update_dataframe(): {e}")

def find_and_download_file(client, filename, folder_id, download_path):
    found_files = search_file(client, filename, folder_id)
    if not found_files:
        logging.error(f"File {filename} not found.")
        return "Not Found"
    file_to_download = found_files[0]
    download_successful = download_file(client, file_to_download, download_path)
    if download_successful:
        return "Downloaded"
        return "Download Error"

def download_file(client, file_to_download, download_path):
    global download_count  # Declare the variable as global
    global download_count_lock  # Declare the lock as global

    try:"Found file {}. Downloading ...")
        item_download_path = os.path.join(download_path,
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)"Download completed for {}.")
        with download_count_lock:  # Lock the counter while updating it
            download_count += 1  # Increment the counter
        return True
    except Exception as e:
        logging.error(f"Failed to download file: {e}")
        return False

def process_single_file(client, filename, folder_id, download_path, df, index, semaphore, pbar, pbar_lock):    
    with semaphore:
  "Thread for {filename} started.")
            downloaded = find_and_download_file(client, filename, folder_id, download_path)
            status = downloaded if downloaded in ["Not Found", "Downloaded"] else "Download Error"
            with df_lock:
                update_dataframe(df, index, status, excel_file_path=os.getenv('EXCEL_FILE_PATH'), acquire_lock=False)

            with pbar_lock:

        except Exception as e:
            logging.error(f"Thread for {filename} encountered an error: {e}")
            with df_lock:
                update_dataframe(df, index, "Error", excel_file_path=os.getenv('EXCEL_FILE_PATH'), acquire_lock=False)

from tqdm import tqdm

def main():
        start_time =
        global download_count  # Use the global download_count variable
        download_count = 0  # Reset the counter before starting
        client = authenticate_box_client()
        folder_id = os.getenv('FOLDER_ID')
        excel_file_path = os.getenv('EXCEL_FILE_PATH')
        df = pd.read_excel(excel_file_path)
        # Filter the DataFrame to only include rows where 'Status' is NaN or 'Not Found'
        filtered_df = df[(isna(df['Status'])) | (df['Status'] == 'Not Found')]

        current_time ="%Y-%m-%d")
        download_path = f"Download_folder_{current_time}"
        os.makedirs(download_path, exist_ok=True)

        max_threads = 10  # Set the maximum number of simultaneous threads
        semaphore = Semaphore(max_threads)
        # Initialize tqdm progress bar
        pbar = tqdm(total=len(filtered_df), desc="Processing files", dynamic_ncols=True)
        pbar_lock = threading.Lock()

        threads = []
        for index, row in filtered_df.iterrows():
            filename = row['Filename']
            t = threading.Thread(target=process_single_file, args=(client, filename, folder_id, download_path, df, index, semaphore, pbar, pbar_lock))
            t.daemon = True

        for t in threads:
            if t.is_alive():
                logging.error(f"Thread for {t.filename} did not complete in time.")

        # Close the progress bar

        # Save DataFrame to Excel
            df.to_excel(excel_file_path, index=False)
            df_saved = pd.read_excel(excel_file_path)
            if not df.equals(df_saved):
                logging.warning("DataFrame might not have saved correctly.")
      "DataFrame saved correctly.")    
        except Exception as e:
            logging.error(f"Could not write to Excel file: {e}")

    except FileNotFoundError as e:
        logging.error(f"A file was not found: {e}")
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):
            logging.critical(f"Context Info: {e.context_info}")
    except KeyboardInterrupt:"Exiting due to manual interruption.")

    end_time =
    elapsed_time = end_time - start_time"Total time elapsed: {elapsed_time}")"Total number of files downloaded: {download_count}")

if __name__ == '__main__':