Preprocessing Datasets

Preprocessing Datasets

from corprep import HyFI  # type: ignore

# When running inside Colab, mount Google Drive before initializing the project.
if HyFI.is_colab():
    HyFI.mount_google_drive()

# Take the project root from DOTENV_DIR as reported by HyFI.dotenv();
# fall back to the current directory when that value is empty.
project_root = HyFI.dotenv().DOTENV_DIR or "."

# Initialize the project; `prj` is reused by the cells that follow.
prj = HyFI.init_project(
    project_name="corporate-reputation",
    project_root=project_root,
    global_workspace_name="workspace",
    log_level="WARNING",
    verbose=True,
)

# Show the resolved locations so the setup can be checked at a glance.
print("project directory:", prj.root_dir)
print("project workspace directory:", prj.workspace_dir)
INFO:hyfi.composer:Composing `ProjectConfig` class with `__init__` config in `project` group.
INFO:hyfi.composer:Composing `JobLibConfig` class with `__init__` config in `joblib` group.
INFO:hyfi.composer:Composing `PathConfig` class with `__init__` config in `path` group.
INFO:hyfi.joblib:initialized batcher with <hyfi.joblib.batch.batcher.Batcher object at 0x7f372c77b610>
project directory: /raid/cis/yjlee/workspace/projects/corporate-reputation
project workspace directory: /raid/cis/yjlee/workspace/projects/corporate-reputation/workspace
project directory: /raid/cis/yjlee/workspace/projects/corporate-reputation
project workspace directory: /raid/cis/yjlee/workspace/projects/corporate-reputation/workspace
# Locate the "raw/daum_news_20230707" directory under the project's dataset path.
# NOTE(review): `glboal_dataset_dir` looks like a misspelling of
# `global_dataset_dir`, but the recorded cell output shows the attribute
# resolves as written -- confirm against the installed HyFI version before
# renaming it.
raw_news_dir = prj.path.glboal_dataset_dir / "raw/daum_news_20230707"

# Start from an empty list so later cells can rely on the name existing.
raw_data_files = []
if not raw_news_dir.exists():
    print("No raw data files found.")
else:
    # Collect every *.dat file in the directory and preview the first three.
    raw_data_files = HyFI.get_filepaths(f"{raw_news_dir}/*.dat")
    print(raw_data_files[:3])
['/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2018.dat', '/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2011.dat', '/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2021.dat']