async def load_file(path: str, group: dict | None) -> pd.DataFrame:
if "id" not in data.columns:
data["id"] = data.apply(lambda x: gen_md5_hash(x, x.keys()), axis=1)
# 获取指定的source列,并保存为source列
if csv_config.source_column is not None and "source" not in data.columns:
data["source"] = data.apply(
lambda x: x[csv_config.source_column], axis=1
if csv_config.text_column is not None and "text" not in data.columns:
data["text"] = data.apply(lambda x: x[csv_config.text_column], axis=1)
# 获取指定的title_column,并将其保存为title列
if csv_config.title_column is not None and "title" not in data.columns:
data["title"] = data.apply(lambda x: x[csv_config.title_column], axis=1)
# 获取指定的时间列,处理时间列timestamp_column
if csv_config.timestamp_column is not None:
data["timestamp"] = pd.to_datetime(
data[csv_config.timestamp_column], format=fmt