mirror of
https://github.com/syssec-utd/pylingual.git
synced 2026-05-11 02:40:13 -07:00
61 lines
2.2 KiB
Python
61 lines
2.2 KiB
Python
from io import BytesIO
|
|
from typing import Dict, List, Literal
|
|
|
|
from datasets import load_dataset
|
|
from huggingface_hub import HfApi
|
|
|
|
from .DatasetDescription import DatasetDescription
|
|
|
|
# Maps a dataset split name to the list of CSV file paths belonging to that
# split — the shape expected by `datasets.load_dataset("csv", data_files=...)`.
LOCAL_DATASET = Dict[Literal["train", "test", "valid"], List[str]]
|
|
|
|
|
|
def upload_single_dataset(data_files: LOCAL_DATASET, dataset_name: str, dataset_card: str):
    """Publish one CSV-backed dataset to the Hugging Face Hub.

    Loads the per-split CSV files in ``data_files``, pushes the resulting
    dataset to the private repo ``dataset_name``, then uploads
    ``dataset_card`` — extended with the loaded dataset's split statistics —
    as the repo's ``README.md``.
    """
    hub_dataset = load_dataset("csv", data_files=data_files)
    hub_dataset.push_to_hub(dataset_name, private=True)

    # Append the repr of the loaded DatasetDict so the card shows split sizes.
    card_text = dataset_card + f"\n\nDataset Statistics:\n\n```\n{hub_dataset}\n```"

    HfApi().upload_file(
        path_or_fileobj=BytesIO(card_text.encode("utf-8")),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
    )
|
|
|
|
|
|
def upload_dataset_to_huggingface(dataset_description: DatasetDescription):
    """Build and publish the segmentation and statement datasets described by
    ``dataset_description``, sharing a single generated dataset card.

    For each split (train/test/valid) the CSV files are gathered from
    ``csv_dir/<split>/<task>`` and uploaded as
    ``<huggingface_user>/<task>-<name>``.
    """
    formatted_data_requests = "\n".join(
        f"{str(req.source_path.resolve())}: (train: {req.num_train}, test: {req.num_test}, valid: {req.num_valid})"
        for req in dataset_description.data_requests
    )

    dataset_card = f"""
# {dataset_description.name}

Created by the Syssec team @ UTD

Dataset Composition:

```
{formatted_data_requests}
```

Python version: `{".".join(map(str, dataset_description.version))}`
"""

    splits: List[Literal["train", "test", "valid"]] = ["train", "test", "valid"]

    # Gather the absolute CSV paths for every split of one task type.
    def collect_csvs(task: str) -> LOCAL_DATASET:
        return {
            split: [
                str(csv_path.resolve())
                for csv_path in (dataset_description.csv_dir / split / task).glob("*.csv")
            ]
            for split in splits
        }

    # Upload each task's dataset under "<user>/<task>-<name>".
    for task in ("segmentation", "statement"):
        repo_name = f"{dataset_description.huggingface_user}/{task}-{dataset_description.name}"
        upload_single_dataset(collect_csvs(task), repo_name, dataset_card)
|