当前位置：首页 > news >正文

# OpenNeuro数据集下载指南（已成功）

news 2026/7/26 6:49:46

1.在OpenNeuro数据集页面的Download中找到shell下载脚本

2.双击shell脚本

直接运行脚本下载时，可能会全部失败，或者部分文件下载失败

3.这时候把上述shell脚本的后缀改为txt，用文本编辑器打开，可以看到里面是一条条curl下载命令

4.在main文件夹中创建一个analyze_curl_commands.py（注意：运行时的工作目录应为该py文件所在的文件夹；需要把脚本末尾的 input_file = "ds004856-1.2.0.txt" 改成自己的txt文件名）

#!/usr/bin/env python3
"""Resumable downloader for OpenNeuro datasets.

Reads the curl commands from an OpenNeuro download script (renamed to
``.txt``), downloads every file that is not yet present under ``dataset/``,
records the outcome of each file in a CSV log, saves the commands that
failed, and writes a text report describing the dataset directory tree.
"""
import re
import os
import subprocess
import time
import csv
from datetime import datetime
from collections import defaultdict
from pathlib import Path
from urllib.parse import urlparse

# URL as it appears in the OpenNeuro script: "curl --create-dirs <url> -o <path>".
_URL_AFTER_CREATE_DIRS = re.compile(r'curl\s+--create-dirs\s+([^\s]+)')
# Fallback for curl lines that carry other/no options before the URL.
_ANY_HTTP_URL = re.compile(r'(https?://[^\s]+)')
# Target path given to curl's -o option.
_OUTPUT_PATH = re.compile(r'-o\s+([^\s]+)')


def create_download_log(filename="download_log.csv"):
    """Create (or overwrite) the CSV download log and write its header row.

    Args:
        filename: Path of the CSV log file.

    Returns:
        The path that was written, i.e. ``filename``.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['文件名', '下载时间', '成功状态', '备注'])
    return filename


def update_download_log(filename, file_path, status, note,
                        log_filename="download_log.csv"):
    """Append one timestamped row to the CSV download log.

    Args:
        filename: Unused; kept so existing callers keep working.
        file_path: Path of the file the row is about.
        status: Short status text for the row.
        note: Free-form remark for the row.
        log_filename: Path of the CSV log file to append to.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(log_filename, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([file_path, timestamp, status, note])


def parse_curl_commands(input_file):
    """Extract the download URL and output path from every curl line.

    Only lines that start with ``curl`` and contain ``-o `` are considered.

    Args:
        input_file: Path of the text file holding the curl commands.

    Returns:
        A list of dicts with the keys ``url``, ``output_path``, ``line_num``
        (1-based) and ``original_line`` (the stripped line).
    """
    curl_commands = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not (line.startswith('curl') and '-o ' in line):
                continue
            url_match = (_URL_AFTER_CREATE_DIRS.search(line)
                         or _ANY_HTTP_URL.search(line))
            output_match = _OUTPUT_PATH.search(line)
            if url_match and output_match:
                curl_commands.append({
                    'url': url_match.group(1),
                    'output_path': output_match.group(1),
                    'line_num': line_num,
                    'original_line': line,
                })
    return curl_commands


def check_dataset_folder():
    """Make sure the ``dataset`` folder exists in the working directory.

    Returns:
        True if the folder already existed, False if it was just created.
    """
    dataset_path = Path('dataset')
    if dataset_path.exists():
        print("dataset文件夹已存在")
        return True
    print("创建dataset文件夹...")
    dataset_path.mkdir()
    return False


def check_downloaded_files(curl_commands, log_filename="download_log.csv"):
    """Compare the expected files with what is already under ``dataset/``.

    A file counts as downloaded when it exists and is not empty. Every file
    gets a row in the download log, and a progress summary is printed.

    Args:
        curl_commands: Command dicts as returned by ``parse_curl_commands``.
        log_filename: Path of the CSV log file.

    Returns:
        A tuple ``(missing_files, directory_errors, download_ratio)``:
        the command dicts still to download, a list of
        ``{'path', 'file'}`` dicts for files whose parent directory is
        missing, and the fraction of files already present (0 when there
        are no commands).
    """
    total_files = len(curl_commands)
    downloaded_files = 0
    missing_files = []
    directory_errors = []

    for cmd in curl_commands:
        file_path = Path('dataset') / cmd['output_path']
        expected_dir = file_path.parent
        if not expected_dir.exists():
            directory_errors.append({
                'path': str(expected_dir),
                'file': cmd['output_path'],
            })
        if file_path.exists() and file_path.stat().st_size > 0:
            downloaded_files += 1
            update_download_log(cmd['output_path'], cmd['output_path'],
                                '已存在', '文件已存在于dataset中', log_filename)
        else:
            missing_files.append(cmd)
            update_download_log(cmd['output_path'], cmd['output_path'],
                                '未下载', '文件不存在，待下载', log_filename)

    download_ratio = downloaded_files / total_files if total_files > 0 else 0

    print("\n=== 下载进度统计 ===")
    print(f"总文件数量: {total_files}")
    print(f"已下载文件: {downloaded_files}")
    print(f"未下载文件: {len(missing_files)}")
    print(f"下载比率: {download_ratio:.2%}")
    print(f"目录错误数量: {len(directory_errors)}")

    return missing_files, directory_errors, download_ratio


def _remove_partial_file(path):
    """Delete a leftover partial download so a later run retries the file."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass  # Nothing was written; there is nothing to clean up.
    except OSError as e:
        print(f"警告: 无法删除不完整的文件 {path}: {e}")


def download_file(url, output_path, current_index, total_files, max_retries=2,
                  timeout_limit=180, log_filename="download_log.csv"):
    """Download one file with curl into ``dataset/<output_path>``.

    Failed attempts are retried after a two second pause. A timeout is not
    retried. Whatever curl wrote before a failure or timeout is deleted,
    because ``check_downloaded_files`` treats any non-empty file as complete.

    Note: a download only counts as successful when the resulting file is
    non-empty, so a legitimately empty remote file is reported as a failure.

    Args:
        url: URL to fetch.
        output_path: Target path relative to the ``dataset`` folder.
        current_index: 1-based position of this file, used for progress text.
        total_files: Number of files in this run, used for progress text.
        max_retries: Extra attempts made after the first one fails.
        timeout_limit: Seconds allowed per attempt; a falsy value disables
            the timeout.
        log_filename: Path of the CSV log file.

    Returns:
        True on success, the string ``'timeout'`` when an attempt timed out,
        and False when every attempt failed.
    """
    full_output_path = Path('dataset') / output_path
    full_output_path.parent.mkdir(parents=True, exist_ok=True)

    progress_percent = (current_index / total_files) * 100 if total_files else 0.0
    progress = f"[{current_index}/{total_files}] ({progress_percent:.1f}%)"
    cmd = [
        'curl', '--create-dirs', '--silent', '--show-error', '--fail',
        '--location', '--output', str(full_output_path), url,
    ]

    for attempt in range(max_retries + 1):
        try:
            print(f"{progress} 下载 {output_path} "
                  f"(尝试 {attempt + 1}/{max_retries + 1})")
            # timeout=None means "wait forever", matching a falsy timeout_limit.
            result = subprocess.run(cmd, capture_output=True, text=True,
                                    timeout=timeout_limit or None)
            if (result.returncode == 0 and full_output_path.exists()
                    and full_output_path.stat().st_size > 0):
                print(f"✓ {progress} 下载成功: {output_path}")
                update_download_log(output_path, output_path, '成功',
                                    f'下载成功，尝试次数: {attempt + 1}',
                                    log_filename)
                return True
            print(f"✗ {progress} 下载失败: {output_path} - {result.stderr}")
            _remove_partial_file(full_output_path)
        except subprocess.TimeoutExpired:
            print(f"✗ {progress} 下载超时({timeout_limit}秒): {output_path}")
            _remove_partial_file(full_output_path)
            update_download_log(
                output_path, output_path, '超时',
                f'下载超时(超过{timeout_limit}秒)，尝试次数: {attempt + 1}',
                log_filename)
            return 'timeout'
        except Exception as e:
            # Best effort: e.g. curl missing from PATH. Report and retry.
            print(f"✗ {progress} 下载错误: {output_path} - {str(e)}")
            _remove_partial_file(full_output_path)

        if attempt < max_retries:
            time.sleep(2)

    update_download_log(output_path, output_path, '失败',
                        f'下载失败，尝试次数: {max_retries + 1}', log_filename)
    return False


def generate_failed_downloads_file(failed_commands, filename="failed_downloads.txt"):
    """Save the original curl line of every failed download to a text file.

    Each command is preceded by a ``#`` comment holding its
    ``failure_reason`` (``未知错误`` when the key is absent).

    Args:
        failed_commands: Command dicts of the downloads that failed.
        filename: Path of the file to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for cmd in failed_commands:
            reason = cmd.get('failure_reason', '未知错误')
            f.write(f"# 失败原因: {reason}\n")
            f.write(cmd['original_line'] + '\n\n')
    print(f"未下载文件列表已保存到: {filename}")


def analyze_curl_file(input_file):
    """Count the files and directories named by the curl commands.

    Args:
        input_file: Path of the text file holding the curl commands.

    Returns:
        A tuple ``(file_count, directories, files_by_dir)``: the number of
        output files, the set of directories that directly contain a file,
        and a ``defaultdict(list)`` mapping each directory to its file names
        (files without a directory are stored under the key ``''``).
    """
    file_count = 0
    directories = set()
    files_by_dir = defaultdict(list)

    print(f"正在分析文件: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not (line.startswith('curl') and '-o ' in line):
                continue
            match = _OUTPUT_PATH.search(line)
            if not match:
                continue
            output_path = match.group(1)
            file_count += 1
            dir_path = os.path.dirname(output_path)
            if dir_path:
                directories.add(dir_path)
            files_by_dir[dir_path].append(os.path.basename(output_path))
            if file_count % 1000 == 0:
                print(f"已处理 {file_count} 个文件...")

    print("\n分析完成!")
    print(f"总文件数量: {file_count}")
    print(f"总目录数量: {len(directories)}")
    return file_count, directories, files_by_dir


def _write_tree(out, subtree, dir_path, prefix, files_by_dir):
    """Write the files of ``dir_path`` and then its subdirectories as a tree.

    The full directory path is carried through the recursion so that files
    are looked up under the right key at any nesting depth.
    """
    # Files are tagged with None so an empty directory ({}) stays a directory.
    entries = [(name, None) for name in sorted(files_by_dir.get(dir_path, []))]
    entries.extend(sorted(subtree.items()))
    last_index = len(entries) - 1
    for index, (name, child) in enumerate(entries):
        is_last = index == last_index
        connector = "└── " if is_last else "├── "
        if child is None:
            out.write(f"{prefix}{connector}{name}\n")
        else:
            out.write(f"{prefix}{connector}{name}/\n")
            # Prefix segments are as wide as a connector to keep columns aligned.
            child_prefix = prefix + ("    " if is_last else "│   ")
            _write_tree(out, child, f"{dir_path}/{name}", child_prefix,
                        files_by_dir)


def generate_directory_structure(directories, files_by_dir, output_file):
    """Write a report with statistics, a directory tree and mkdir commands.

    Args:
        directories: Iterable of '/'-separated directory paths.
        files_by_dir: Mapping from directory path to the file names in it.
        output_file: Path of the report file to write.
    """
    print(f"\n正在生成目录结构文件: {output_file}")
    sorted_dirs = sorted(directories)

    # Count distinct "sub-*" path components (the first one in each path).
    sub_dirs = set()
    for d in directories:
        for part in d.split('/'):
            if part.startswith('sub-'):
                sub_dirs.add(part)
                break

    # Nested dict mirroring the directory hierarchy, including parents that
    # hold no files themselves.
    dir_tree = {}
    for dir_path in sorted_dirs:
        node = dir_tree
        for part in dir_path.split('/'):
            node = node.setdefault(part, {})

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("## 统计信息\n")
        f.write(f"- 总文件数量: {sum(len(files) for files in files_by_dir.values())}\n")
        f.write(f"- 总目录数量: {len(directories)}\n")
        f.write(f"- sub目录数量: {len(sub_dirs)}\n\n")

        f.write("## 目录结构\n\n")
        for root_dir in sorted(dir_tree):
            f.write(f"### {root_dir}/\n")
            _write_tree(f, dir_tree[root_dir], root_dir, "", files_by_dir)
            f.write("\n")

        f.write("\n## 创建目录结构的Shell命令\n\n")
        f.write("```bash\n")
        f.write("#!/bin/bash\n\n")
        for dir_path in sorted_dirs:
            f.write(f"mkdir -p \"{dir_path}\"\n")
        f.write("\necho \"目录结构创建完成!\"\n")
        f.write("```\n")

    print(f"目录结构文件已生成: {output_file}")


def main(input_file, output_file):
    """Run the interactive download workflow.

    Asks whether to keep the 3 minute per-file timeout, downloads every file
    missing from ``dataset/``, saves the failed commands, and writes the
    directory structure report.

    Args:
        input_file: Path of the text file holding the curl commands.
        output_file: Path of the directory structure report to write.
    """
    if not os.path.exists(input_file):
        print(f"错误: 找不到输入文件 {input_file}")
        return

    # The download script is named after the dataset, e.g. "ds004856-1.2.0".
    dataset_name = Path(input_file).stem

    print("=== 数据集下载管理器 ===")
    print("\n=== 文件整理提示 ===")
    print(" 请按照以下步骤整理已下载的文件:")
    print(f"1. 将已下载的数据移动至 dataset/{dataset_name}/")
    print("2. 没有文件则从头开始下载")
    print(f"\n查看详细的目录结构请参考: {output_file}")

    log_filename = create_download_log()
    print(f"已创建下载日志文件: {log_filename}")
    print(f"查看下载日志请参考: {log_filename}")

    print("\n警告: 默认单个文件下载超时限制为3分钟")
    print("如果网络较慢或文件较大，可能导致下载失败")
    while True:
        user_input = input("\n是否解除3分钟下载超时限制？(y/n): ").strip().lower()
        if user_input in ['y', 'yes', '是']:
            timeout_limit = None
            print("已解除超时限制，下载将不会因时间限制而中断")
            break
        elif user_input in ['n', 'no', '否']:
            timeout_limit = 180
            print("保持3分钟超时限制")
            break
        else:
            print("请输入 y 或 n")

    dataset_exists = check_dataset_folder()

    print("\n解析curl命令文件...")
    curl_commands = parse_curl_commands(input_file)
    print(f"找到 {len(curl_commands)} 个下载命令")
    if not curl_commands:
        # Without this guard an unparsable file would be reported as
        # "all files downloaded".
        print(f"错误: 在 {input_file} 中没有找到任何curl下载命令")
        return

    if dataset_exists:
        print("\n检查已下载文件...")
        missing_files, directory_errors, _ = check_downloaded_files(
            curl_commands, log_filename)
        if directory_errors:
            print(f"\n发现 {len(directory_errors)} 个目录错误:")
            for error in directory_errors[:5]:
                print(f"  缺少目录: {error['path']}")
        if not missing_files:
            print("\n所有文件已下载完成!")
            return
        print(f"\n开始下载 {len(missing_files)} 个缺失文件...")
        download_files = missing_files
    else:
        print("\n开始从头下载所有文件...")
        download_files = curl_commands
        for cmd in curl_commands:
            update_download_log(cmd['output_path'], cmd['output_path'],
                                '待下载', '新建dataset，准备下载', log_filename)

    failed_downloads = []
    success_count = 0
    for i, cmd in enumerate(download_files, 1):
        result = download_file(cmd['url'], cmd['output_path'], i,
                               len(download_files),
                               timeout_limit=timeout_limit,
                               log_filename=log_filename)
        if result is True:
            success_count += 1
        else:
            cmd_copy = cmd.copy()
            if result == 'timeout':
                cmd_copy['failure_reason'] = (
                    f'下载超时(超过{timeout_limit}秒)' if timeout_limit else '下载超时')
            else:
                # download_file returns False for every non-timeout failure.
                cmd_copy['failure_reason'] = '下载失败'
            failed_downloads.append(cmd_copy)

        if i % 10 == 0:
            current_ratio = success_count / i
            print(f"\n当前成功率: {current_ratio:.2%} ({success_count}/{i})")
            print(f"剩余文件: {len(download_files) - i}")

    print("\n=== 下载完成 ===")
    print(f"成功下载: {success_count}")
    print(f"下载失败: {len(failed_downloads)}")

    if failed_downloads:
        failed_file = "failed_downloads.txt"
        generate_failed_downloads_file(failed_downloads, failed_file)
        print(f"失败的下载命令已保存到: {failed_file}")

    print("\n生成目录结构分析...")
    file_count, directories, files_by_dir = analyze_curl_file(input_file)
    generate_directory_structure(directories, files_by_dir, output_file)

    print("\n=== 最终统计 ===")
    print(f"总文件数量: {file_count}")
    print(f"总目录数量: {len(directories)}")
    print(f"成功下载: {success_count}")
    print(f"下载失败: {len(failed_downloads)}")
    print(f"目录结构已保存到: {output_file}")


if __name__ == "__main__":
    input_file = "ds004856-1.2.0.txt"
    output_file = "ds004856_directory_structure.txt"
    main(input_file, output_file)

5.在py同级目录下新建一个txt文件，粘贴以下内容后把后缀改为bat，双击即可一键运行

@echo off
rem One-click launcher for analyze_curl_commands.py.
rem Verifies that Python, curl, network access and the required input files
rem are available, asks for confirmation, then runs the download manager.

rem Delayed expansion is deliberately left off: no !var! syntax is used here,
rem and enabling it would strip the "!" from the echoed messages.
setlocal

rem Work from the folder containing this script so the relative file checks
rem and the Python script's relative paths resolve correctly even when the
rem launcher is started from another directory (e.g. "Run as administrator").
cd /d "%~dp0"

echo Starting batch file...
echo.

echo Checking Python installation...
python --version 2>nul
if errorlevel 1 (
    echo [ERROR] Python interpreter not found
    echo Please ensure Python 3.x is installed and added to system PATH
    pause
    exit /b 1
)
echo [OK] Python found

echo Checking curl installation...
curl --version 2>nul
if errorlevel 1 (
    echo [ERROR] curl command not found
    echo Please install curl tool or use Windows 10/11 built-in version
    pause
    exit /b 1
)
echo [OK] curl found

echo Checking network connectivity...
rem baidu.com is the mandatory reachability probe; google.com is optional.
ping -n 1 baidu.com >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Cannot connect to baidu.com - Network connectivity issue
    echo Please check your internet connection
    pause
    exit /b 1
)
echo [OK] Network connectivity to baidu.com verified
ping -n 1 google.com >nul 2>&1
if errorlevel 1 (
    echo [WARNING] Cannot connect to google.com - but baidu.com is accessible
    echo Continuing with baidu.com connectivity...
) else (
    echo [OK] Network connectivity to google.com verified
)

echo Checking main program file...
if not exist "analyze_curl_commands.py" (
    echo [ERROR] Main program file analyze_curl_commands.py not found
    echo Please ensure this file exists in current directory
    pause
    exit /b 1
)
echo [OK] Main program file found

echo Checking curl command file...
if not exist "ds004856-1.2.0.txt" (
    echo [ERROR] curl command file ds004856-1.2.0.txt not found
    echo.
    echo Please prepare dataset following these steps:
    echo 1. Download ds004856-1.2.0 dataset curl command file from OpenNeuro
    echo 2. Rename the file to ds004856-1.2.0.txt
    echo 3. Place the file in current directory
    echo 4. Re-run this script
    echo.
    pause
    exit /b 1
)
echo [OK] curl command file found

echo.
echo [SUCCESS] All environment checks passed!
echo.
echo [INFO] Dataset Information:
echo Input file: ds004856-1.2.0.txt
echo Output directory: dataset\
echo.

rem Clear any inherited value so that pressing Enter alone means "no".
set "choice="
set /p "choice=Start downloading dataset? (y/n): "
if /i not "%choice%"=="y" (
    echo Download cancelled
    pause
    exit /b 0
)

echo.
echo [START] Running dataset download manager...
echo ======================================
echo.
python analyze_curl_commands.py
if errorlevel 1 (
    echo.
    echo [ERROR] Program execution failed
    pause
    exit /b 1
) else (
    echo.
    echo ======================================
    echo [SUCCESS] Download manager completed!
    echo ======================================
    echo.
    echo [NOTICE] File Organization Instructions:
    echo 1. Check downloaded files in dataset\ds004856-1.2.0\ directory
    echo 2. Move files to correct subdirectories according to dataset structure
    echo 3. Ensure file naming follows BIDS standard format
    echo 4. Refer to dataset_description.json and README files for structure info
    echo 5. Manually rename or move any duplicate or incorrectly named files
    echo.
    echo [SUGGESTION] Use the following command to check file structure:
    echo dir /s dataset\ds004856-1.2.0\
    echo.
)
echo.
pause

查看全文

http://www.jsqmd.com/news/950418/