# Preprocessing Datasets
from corprep import HyFI # type: ignore
# When HyFI reports a Colab runtime, ask it to mount Google Drive before
# the project is initialised.
if HyFI.is_colab():
    HyFI.mount_google_drive()

# Use the directory reported by the dotenv settings as the project root;
# fall back to the current directory when that value is empty/unset.
dotenv_dir = HyFI.dotenv().DOTENV_DIR
prj = HyFI.init_project(
    project_name="corporate-reputation",
    project_root=dotenv_dir if dotenv_dir else ".",
    global_workspace_name="workspace",
    log_level="WARNING",
    verbose=True,
)

# Echo the resolved locations so the notebook output records where the
# project and its workspace live.
print("project directory:", prj.root_dir)
print("project workspace directory:", prj.workspace_dir)
INFO:hyfi.composer:Composing `ProjectConfig` class with `__init__` config in `project` group.
INFO:hyfi.composer:Composing `JobLibConfig` class with `__init__` config in `joblib` group.
INFO:hyfi.composer:Composing `PathConfig` class with `__init__` config in `path` group.
INFO:hyfi.joblib:initialized batcher with <hyfi.joblib.batch.batcher.Batcher object at 0x7f372c77b610>
project directory: /raid/cis/yjlee/workspace/projects/corporate-reputation
project workspace directory: /raid/cis/yjlee/workspace/projects/corporate-reputation/workspace
/raid/cis/yjlee/workspace/projects/corporate-reputation
project workspace directory: /raid/cis/yjlee/workspace/projects/corporate-reputation/workspace
# Directory expected to hold the raw Daum news dumps (*.dat files).
# NOTE(review): `glboal_dataset_dir` looks like a misspelling of
# `global_dataset_dir`; it is kept as-is because the attribute name is
# defined outside this file — confirm against the HyFI path config.
raw_news_dir = prj.path.glboal_dataset_dir / "raw/daum_news_20230707"

# Default to an empty list so later cells can rely on the name existing
# even when the directory is missing.
raw_data_files = []
if not raw_news_dir.exists():
    print("No raw data files found.")
else:
    dat_pattern = f"{raw_news_dir}/*.dat"
    raw_data_files = HyFI.get_filepaths(dat_pattern)
    # Show only a small sample of the matches.
    print(raw_data_files[:3])
['/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2018.dat', '/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2011.dat', '/home/yjlee/workspace/datasets/raw/daum_news_20230707/daum_news_2021.dat']