You've already forked RekomenciBackend
129 lines
5.2 KiB
Python
Executable File
129 lines
5.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import ast
|
|
import asyncio
|
|
import csv
|
|
from decimal import Decimal
|
|
from pathlib import Path
|
|
|
|
from template_project.adapters.unit_of_work import DefaultUnitOfWork
|
|
from template_project.application.common.embedding import Embedder
|
|
from template_project.application.common.enums import ExperienceType
|
|
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
|
|
from template_project.ml.configuration import load_configuration as load_ml_configuration
|
|
from template_project.ml.ioc.make import make_ioc as make_ml_ioc
|
|
from template_project.web_api.configuration import load_configuration as load_backend_configuration
|
|
from template_project.web_api.ioc.make import make_ioc as make_backend_ioc
|
|
|
|
|
|
def parse_skills(skills_str: str) -> list[str]:
    """Parse a Python-literal list of skills into a list of strings.

    The text is evaluated with ``ast.literal_eval``, which accepts literals
    only and never executes code. Input that is malformed, or that evaluates
    to something other than a list, yields an empty list instead of raising.

    Args:
        skills_str: Text holding a list literal, e.g. ``"['Python', 'SQL']"``.

    Returns:
        Every element of the parsed list converted with ``str``, or ``[]``
        when the text cannot be parsed or is not a list.
    """
    try:
        parsed = ast.literal_eval(skills_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input; for example
        # "{[1]: 2}" fails with TypeError because the dict key is unhashable.
        return []

    if not isinstance(parsed, list):
        return []
    return [str(skill) for skill in parsed]
|
|
|
|
|
|
def compose_embedding_text(position: str, description: str, key_skills: list[str]) -> str:
    """Build the single text string that is passed to the embedder.

    The position, the description and the comma-separated skills are joined
    with single spaces; empty pieces are left out.

    Args:
        position: Vacancy title.
        description: Vacancy description, possibly empty.
        key_skills: Skill names, possibly empty.

    Returns:
        The non-empty pieces joined by ``" "`` (``""`` when all are empty).
    """
    fragments = [position, description]
    if key_skills:
        fragments.append(", ".join(key_skills))
    # Drop empty fragments so the result has no doubled or trailing spaces.
    return " ".join(fragment for fragment in fragments if fragment)
|
|
|
|
|
|
async def main() -> None:
    """Load vacancies from a CSV file and persist them with their embeddings.

    Builds the backend and ML IoC containers from ``config.toml`` and
    ``infrastructure/configs/ml/config.toml`` (both relative to the current
    working directory), resolves a ``DefaultUnitOfWork`` and an ``Embedder``,
    and reads at most ``max_records`` rows from ``filtered_vacancies.csv``.

    For every usable row a ``Vacancy`` and a ``VacancyEmbedding`` (encoded
    from the position, description and key skills) are added to the unit of
    work, which is committed every ``batch_size`` vacancies and once more for
    the remainder.

    A row is skipped when ``vacancy_id`` or ``vacancy_nm`` is empty, when
    ``experience`` is not a valid ``ExperienceType`` value, or when a salary
    field cannot be parsed as a decimal; an empty salary field counts as 0.
    Any other per-row failure is printed and the row is skipped, so a single
    bad row does not abort the import. Both containers are closed on exit.
    """
    # Function-scope import: only the salary parsing below needs this name.
    from decimal import InvalidOperation

    backend_config_path = Path("config.toml")
    backend_configuration = load_backend_configuration(backend_config_path)
    backend_container = make_backend_ioc(backend_configuration)

    ml_config_path = Path("infrastructure/configs/ml/config.toml")
    ml_configuration = load_ml_configuration(ml_config_path)
    ml_container = make_ml_ioc(ml_configuration)

    csv_path = Path("filtered_vacancies.csv")
    max_records = 1000
    batch_size = 50

    try:
        async with backend_container() as backend_request_container, ml_container() as ml_request_container:
            unit_of_work = await backend_request_container.get(DefaultUnitOfWork)
            embedder = await ml_request_container.get(Embedder)

            print(f"Загружаю первые {max_records} вакансий из {csv_path}...")

            # newline="" lets the csv module handle line endings itself, which
            # is required for newlines embedded in quoted fields.
            with csv_path.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                batch = []
                # Vacancies actually committed so far; unlike the row index it
                # does not count skipped rows.
                total_loaded = 0

                for idx, row in enumerate(reader):
                    if idx >= max_records:
                        break

                    try:
                        # vacancy_id is only checked for presence; the stored
                        # entity uses the id produced by Vacancy.factory.
                        vacancy_id_str = row.get("vacancy_id", "").strip()
                        if not vacancy_id_str:
                            continue

                        position = row.get("vacancy_nm", "").strip()
                        if not position:
                            continue

                        experience_str = row.get("experience", "").strip()
                        try:
                            experience_type = ExperienceType(experience_str)
                        except ValueError:
                            continue

                        salary_from_str = row.get("salary_from", "").strip()
                        salary_to_str = row.get("salary_to", "").strip()
                        try:
                            salary_from = Decimal(salary_from_str) if salary_from_str else Decimal(0)
                            salary_to = Decimal(salary_to_str) if salary_to_str else Decimal(0)
                        except (InvalidOperation, ValueError, TypeError):
                            # Decimal() signals unparsable text with
                            # InvalidOperation, which is an ArithmeticError
                            # and not a ValueError.
                            continue

                        description = row.get("vacancy_description", "").strip()
                        key_skills = parse_skills(row.get("key_skills", "[]"))

                        vacancy = Vacancy.factory(
                            position=position,
                            from_salary=salary_from,
                            to_salary=salary_to,
                            experience_type=experience_type,
                            description=description,
                            key_skills=key_skills,
                        )

                        embedding_text = compose_embedding_text(position, description, key_skills)
                        embedding_vector = await embedder.encode(embedding_text)

                        embedding = VacancyEmbedding.factory(
                            vacancy_id=vacancy.id,
                            vector=embedding_vector,
                        )

                        await unit_of_work.add(vacancy, embedding)
                        batch.append((vacancy.id, position))

                        if len(batch) >= batch_size:
                            await unit_of_work.commit()
                            total_loaded += len(batch)
                            print(f"Загружено {len(batch)} вакансий (всего: {total_loaded})")
                            batch = []

                    except Exception as e:
                        # Deliberate best-effort boundary: report the row and
                        # carry on with the next one.
                        print(f"Ошибка при обработке строки {idx + 1}: {e}")

                # Commit whatever is left over from the last partial batch.
                if batch:
                    await unit_of_work.commit()
                    total_loaded += len(batch)
                    print(f"Загружено {len(batch)} вакансий (всего: {total_loaded})")

            print("Готово!")
    finally:
        # Nested so the ML container is still closed when closing the backend
        # container raises.
        try:
            await backend_container.close()
        finally:
            await ml_container.close()
|
|
|
|
|
|
# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|