#!/usr/bin/env python3
"""Load vacancies from a CSV file and store them together with their embeddings.

The script reads at most ``MAX_RECORDS`` rows from ``CSV_PATH``, builds a
``Vacancy`` and a ``VacancyEmbedding`` for every valid row, and commits them
through the unit of work in batches of ``BATCH_SIZE``.
"""

import ast
import asyncio
import csv
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import NamedTuple, Optional

from template_project.adapters.unit_of_work import DefaultUnitOfWork
from template_project.application.common.embedding import Embedder
from template_project.application.common.enums import ExperienceType
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
from template_project.ml.configuration import load_configuration as load_ml_configuration
from template_project.ml.ioc.make import make_ioc as make_ml_ioc
from template_project.web_api.configuration import load_configuration as load_backend_configuration
from template_project.web_api.ioc.make import make_ioc as make_backend_ioc

BACKEND_CONFIG_PATH = Path("config.toml")
ML_CONFIG_PATH = Path("infrastructure/configs/ml/config.toml")
CSV_PATH = Path("filtered_vacancies.csv")
MAX_RECORDS = 1000  # upper bound on the number of CSV rows that are read
BATCH_SIZE = 50  # number of added vacancies between two commits


class _VacancyRow(NamedTuple):
    """Validated field values extracted from one CSV row."""

    position: str
    salary_from: Decimal
    salary_to: Decimal
    experience_type: ExperienceType
    description: str
    key_skills: list[str]


def parse_skills(skills_str: str) -> list[str]:
    """Parse a Python-literal list of skills such as ``"['Python', 'SQL']"``.

    Args:
        skills_str: Text expected to contain a Python list literal.

    Returns:
        Every element of the list converted to ``str``, or an empty list when
        the text is not a valid literal or the literal is not a list.
    """
    try:
        skills = ast.literal_eval(skills_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval documents TypeError as well (e.g. an unhashable dict key).
        return []
    if isinstance(skills, list):
        return [str(skill) for skill in skills]
    return []


def compose_embedding_text(position: str, description: str, key_skills: list[str]) -> str:
    """Build the text that is passed to the embedder for one vacancy.

    Position, description and the comma-separated skills are joined with a
    single space; empty parts are left out.
    """
    skills_text = ", ".join(key_skills)
    return " ".join(filter(None, (position, description, skills_text)))


def _parse_salary(raw: str) -> Decimal:
    """Convert a stripped salary cell to ``Decimal``; an empty cell means 0."""
    return Decimal(raw) if raw else Decimal(0)


def _parse_row(row: dict[str, str]) -> Optional[_VacancyRow]:
    """Validate one CSV row and extract the values needed to build a vacancy.

    Args:
        row: A row produced by ``csv.DictReader``.

    Returns:
        The extracted values, or ``None`` when the row must be skipped: empty
        ``vacancy_id`` or ``vacancy_nm``, an ``experience`` value rejected by
        ``ExperienceType``, or a salary that is not a valid decimal number.

    Raises:
        AttributeError: If a required cell is ``None`` (``csv.DictReader``
            fills missing trailing columns with ``None``); the caller reports
            such rows as errors.
    """
    # vacancy_id is only checked for presence; it is not passed to the entity.
    if not row.get("vacancy_id", "").strip():
        return None

    position = row.get("vacancy_nm", "").strip()
    if not position:
        return None

    try:
        experience_type = ExperienceType(row.get("experience", "").strip())
    except ValueError:
        return None

    salary_from_str = row.get("salary_from", "").strip()
    salary_to_str = row.get("salary_to", "").strip()
    try:
        salary_from = _parse_salary(salary_from_str)
        salary_to = _parse_salary(salary_to_str)
    except (InvalidOperation, ValueError, TypeError):
        # Decimal signals malformed text with InvalidOperation, which is an
        # ArithmeticError and not a ValueError.
        return None

    return _VacancyRow(
        position=position,
        salary_from=salary_from,
        salary_to=salary_to,
        experience_type=experience_type,
        description=row.get("vacancy_description", "").strip(),
        key_skills=parse_skills(row.get("key_skills", "[]")),
    )


async def main() -> None:
    """Read the CSV file and persist vacancies with their embeddings in batches.

    Rows that fail validation are skipped silently; any other error raised
    while handling a row is printed and the import continues with the next
    row. Both IoC containers are closed when the function exits.
    """
    backend_configuration = load_backend_configuration(BACKEND_CONFIG_PATH)
    backend_container = make_backend_ioc(backend_configuration)

    ml_configuration = load_ml_configuration(ML_CONFIG_PATH)
    ml_container = make_ml_ioc(ml_configuration)

    try:
        async with backend_container() as backend_scope, ml_container() as ml_scope:
            unit_of_work = await backend_scope.get(DefaultUnitOfWork)
            embedder = await ml_scope.get(Embedder)

            print(f"Загружаю первые {MAX_RECORDS} вакансий из {CSV_PATH}...")

            # newline="" lets the csv module handle line breaks inside quoted fields.
            with CSV_PATH.open("r", encoding="utf-8", newline="") as f:
                reader = csv.DictReader(f)
                pending = 0  # vacancies added since the last commit
                row_number = 0  # 1-based number of the last CSV row read

                # zip() stops once the range is exhausted, so row_number never
                # exceeds MAX_RECORDS and no extra row is consumed.
                for row_number, row in zip(range(1, MAX_RECORDS + 1), reader):
                    try:
                        parsed = _parse_row(row)
                        if parsed is None:
                            continue

                        vacancy = Vacancy.factory(
                            position=parsed.position,
                            from_salary=parsed.salary_from,
                            to_salary=parsed.salary_to,
                            experience_type=parsed.experience_type,
                            description=parsed.description,
                            key_skills=parsed.key_skills,
                        )

                        embedding_text = compose_embedding_text(
                            parsed.position,
                            parsed.description,
                            parsed.key_skills,
                        )
                        embedding_vector = await embedder.encode(embedding_text)
                        embedding = VacancyEmbedding.factory(
                            vacancy_id=vacancy.id,
                            vector=embedding_vector,
                        )

                        await unit_of_work.add(vacancy, embedding)
                        pending += 1

                        if pending >= BATCH_SIZE:
                            await unit_of_work.commit()
                            print(f"Загружено {pending} вакансий (всего: {row_number})")
                            pending = 0
                    except Exception as e:
                        # Deliberate best effort: one bad row must not abort the import.
                        # NOTE(review): a failed commit leaves `pending` untouched, so
                        # the next added row triggers another commit attempt - confirm
                        # this is the desired retry behaviour for the unit of work.
                        print(f"Ошибка при обработке строки {row_number}: {e}")

                if pending:
                    await unit_of_work.commit()
                    print(f"Загружено {pending} вакансий (всего: {row_number})")

            print("Готово!")
    finally:
        # Close the second container even if closing the first one fails.
        try:
            await backend_container.close()
        finally:
            await ml_container.close()


if __name__ == "__main__":
    asyncio.run(main())