From effbcfbc2daa635eb9c4ef35283887facb2c9e3f Mon Sep 17 00:00:00 2001 From: ivankirpichnikov Date: Sat, 22 Nov 2025 18:15:29 +0300 Subject: [PATCH] =?UTF-8?q?=D0=BD=D0=B5=20=D0=BF=D0=BE=D0=BC=D0=BD=D0=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dataset/__main__.py | 59 ++++++++++++------- src/dataset/upload_key_skills.py | 37 ++++++++++++ .../adapters/data_gateways/key_skills.py | 2 +- .../adapters/data_gateways/resume.py | 6 +- .../adapters/data_gateways/tables.py | 24 +++++++- .../adapters/data_gateways/vacancy.py | 24 ++++++++ src/template_project/adapters/ml_client.py | 12 ++++ .../adapters/vector_generators/resume.py | 36 +++++++++++ .../application/resume/data_gateway.py | 5 -- .../resume/resume_embedding_pipeline.py | 29 +++++---- .../application/resume/resume_prediction.py | 1 + .../application/resume/vector_generator.py | 6 +- .../application/vacancy/data_gateway.py | 11 ++++ .../ml/interactors/predict_salary.py | 5 +- src/template_project/ml/ioc/interactor.py | 1 - src/template_project/ml/routes/predict.py | 4 +- .../web_api/ioc/data_gateway.py | 2 + 17 files changed, 210 insertions(+), 54 deletions(-) create mode 100644 src/dataset/upload_key_skills.py create mode 100644 src/template_project/adapters/data_gateways/vacancy.py create mode 100644 src/template_project/adapters/ml_client.py create mode 100644 src/template_project/adapters/vector_generators/resume.py create mode 100644 src/template_project/application/resume/resume_prediction.py create mode 100644 src/template_project/application/vacancy/data_gateway.py diff --git a/src/dataset/__main__.py b/src/dataset/__main__.py index 2748afd..3d9d52c 100644 --- a/src/dataset/__main__.py +++ b/src/dataset/__main__.py @@ -1,30 +1,49 @@ import json +import logging +import urllib.parse +from collections.abc import Sequence from pathlib import Path +from typing import Final from adaptix import DebugTrail, NameStyle, Retort, name_mapping from dataset.data_structures import DataSetLine, Salary +from dataset.upload_key_skills import upload_key_kills -retort = Retort( - recipe=[ - name_mapping(Salary, name_style=NameStyle.CAMEL), - ], - debug_trail=DebugTrail.DISABLE, - strict_coercion=False, -) +DATASET_PATH: Final = Path("hh_ru_vacancies.jsonlines") +BASE_URL: Final = "https://team-39-alpha-gm5qjkou.hack.prodcontest.ru" +UPLOAD_KEY_SKILLS: Final = urllib.parse.urljoin(BASE_URL, "key_skills") -raw_lines = [] -with Path("hh_ru_vacancies.jsonlines").open("r", encoding="utf-8") as f: - raw_lines = map(json.loads, f.readlines()) +logger = logging.getLogger(__name__) -lines = retort.load(raw_lines, list[DataSetLine]) -f = set() -c = 0 -for line in lines: - if c == 1000: - break - if line.experience: - f.add(line.experience) - c += 0 -print(f) +def parse_dataset(file_path: Path) -> Sequence[DataSetLine]: + retort = Retort( + recipe=[ + name_mapping(Salary, name_style=NameStyle.CAMEL), + ], + debug_trail=DebugTrail.DISABLE, + strict_coercion=False, + ) + + raw_lines = [] + with file_path.open("r", encoding="utf-8") as f: + raw_lines = map(json.loads, f.readlines()) + + return retort.load(raw_lines, Sequence[DataSetLine]) + + +def main() -> None: + logging.basicConfig(level=logging.INFO) + + logger.info("Parsing dataset...") + + dataset = parse_dataset(DATASET_PATH) + + upload_key_kills(dataset, UPLOAD_KEY_SKILLS) + + logger.info("finished script") + + +if __name__ == "__main__": + main() diff --git a/src/dataset/upload_key_skills.py b/src/dataset/upload_key_skills.py new file mode 100644 index 0000000..6abc1cf --- /dev/null +++ b/src/dataset/upload_key_skills.py @@ -0,0 +1,37 @@ +import logging +from collections.abc import Sequence + +from requests import Session + +from dataset.data_structures import DataSetLine + +logger = logging.getLogger(__name__) + + +def upload_key_kills( + dataset: Sequence[DataSetLine], + upload_endpoint: str, + max_upload_count: int | None = None +) -> None: + session = Session() + + key_skills = [] + for count, line in enumerate(dataset): + if max_upload_count is not None and count >= max_upload_count: + break + + key_skills.extend(line.key_skills) + + logger.info("Upload skills %r", key_skills) + response = session.post( + upload_endpoint, + json={ + "key_skills": key_skills + } + ) + if response.status_code != 200: + logger.warning("Doesn't upload skills. Status code %r", response.status_code) + else: + logger.info("Upload skills %r. Status code %r", key_skills, response.status_code) + + key_skills = [] diff --git a/src/template_project/adapters/data_gateways/key_skills.py b/src/template_project/adapters/data_gateways/key_skills.py index c4d5ad1..3039d68 100644 --- a/src/template_project/adapters/data_gateways/key_skills.py +++ b/src/template_project/adapters/data_gateways/key_skills.py @@ -15,7 +15,7 @@ class KeySkillsDataGateway: async def query(self, query: str) -> Sequence[str]: statement = ( select(key_skills_table.c.name) - .where(key_skills_table.c.name.ilike(f"{query}%")) + .where(key_skills_table.c.name.ilike(f"%{query}%")) .order_by(key_skills_table.c.name) .limit(30) ) diff --git a/src/template_project/adapters/data_gateways/resume.py b/src/template_project/adapters/data_gateways/resume.py index 50256e6..c84cf67 100644 --- a/src/template_project/adapters/data_gateways/resume.py +++ b/src/template_project/adapters/data_gateways/resume.py @@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession from template_project.adapters.data_gateways.tables import resume_prediction_table, resume_table from template_project.application.resume.data_gateway import ResumeDataGateway, ResumePredictionDataGateway -from template_project.application.resume.entity import Resume, ResumeEmbeddingId, ResumeId, ResumePrediction +from template_project.application.resume.entity import Resume, ResumeId, ResumePrediction from template_project.application.resume.errors import ResumeNotFoundError from template_project.application.user.entity import UserId @@ -15,10 +15,6 @@ class DefaultResumeDataGateway(ResumeDataGateway): def __init__(self, session: AsyncSession) -> None: self._session = session - @override - async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]: - raise NotImplementedError - @override async def load(self, resume_id: ResumeId) -> Resume: resume = await self._session.get(Resume, resume_id) diff --git a/src/template_project/adapters/data_gateways/tables.py b/src/template_project/adapters/data_gateways/tables.py index 10f3725..15aaf61 100644 --- a/src/template_project/adapters/data_gateways/tables.py +++ b/src/template_project/adapters/data_gateways/tables.py @@ -3,7 +3,6 @@ from typing import Any, Final, override from pgvector.sqlalchemy import Vector from sqlalchemy import ( - ARRAY, Boolean, Column, DateTime, @@ -23,6 +22,7 @@ from sqlalchemy.orm import registry from template_project.application.access_token.entity import AccessToken from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod +from template_project.application.common.enums import ExperienceType from template_project.application.notification_device.entity import NotificationDevice from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction from template_project.application.user.entity import User @@ -160,6 +160,28 @@ key_skills_table: Final = Table( Column("id", Integer, autoincrement=True, primary_key=True), Column("name", String, nullable=False, unique=True) ) +vacancy_table: Final = Table( + "vacancies", + meta_data, + Column("id", UUID, primary_key=True), + Column("deleted_at", DateTime(timezone=True)), + Column("created_at", DateTime(timezone=True), nullable=False), + Column("position", String, nullable=False), + Column("from_salary", Numeric, nullable=False), + Column("to_salary", Numeric, nullable=False), + Column("experience_type", Enum(ExperienceType), nullable=False), + Column("description", nullable=False), + Column("key_skills", nullable=False), +) +vacancy_embedding_table: Final = Table( + "vacancy_embedding", + meta_data, + Column("id", UUID, primary_key=True), + Column("deleted_at", DateTime(timezone=True)), + Column("created_at", DateTime(timezone=True), nullable=False), + Column("vacancy_id", UUID, ForeignKey("vacancies.id", ondelete="CASCADE"), nullable=False), + Column("vector", Vector, nullable=False), +) mapper_registry.map_imperatively(User, user_table) diff --git a/src/template_project/adapters/data_gateways/vacancy.py b/src/template_project/adapters/data_gateways/vacancy.py new file mode 100644 index 0000000..9e52c53 --- /dev/null +++ b/src/template_project/adapters/data_gateways/vacancy.py @@ -0,0 +1,24 @@ +from collections.abc import Sequence +from typing import override + +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from template_project.adapters.data_gateways.tables import vacancy_embedding_table, vacancy_table +from template_project.application.vacancy.data_gateway import VacancyDataGateway +from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding + + +class DefaultVacancyDataGateway(VacancyDataGateway): + def __init__(self, session: AsyncSession) -> None: + self._session = session + + @override + async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]: + statement = ( + select(Vacancy) + .join(VacancyEmbedding, vacancy_embedding_table.c.id == vacancy_table.c.id) + .where(vacancy_embedding_table.c.vector.cosine_distance(vector) > 0.5) + ) + result = await self._session.execute(statement) + return result.scalars().all() diff --git a/src/template_project/adapters/ml_client.py b/src/template_project/adapters/ml_client.py new file mode 100644 index 0000000..4bfd12f --- /dev/null +++ b/src/template_project/adapters/ml_client.py @@ -0,0 +1,12 @@ +from typing import cast + +from httpx import AsyncClient + + +class MlApiGateway: + def __init__(self, client: AsyncClient) -> None: + self._client = client + + async def generate_embedding(self, text: str) -> list[float]: + response = await self._client.post("/get_embedding", json={"text": text}) + return cast(list[float], response.json()["embedding"]) diff --git a/src/template_project/adapters/vector_generators/resume.py b/src/template_project/adapters/vector_generators/resume.py new file mode 100644 index 0000000..7c202e9 --- /dev/null +++ b/src/template_project/adapters/vector_generators/resume.py @@ -0,0 +1,36 @@ +from typing import Final, override + +from template_project.application.common.enums import ExperienceType + +from template_project.adapters.ml_client import MlApiGateway +from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator + +EMBEDDING_TEXT_TEMPLATE: Final = """ + Позиция: {position} + Опыт: {experience_type} + Ключевые навыки: {key_skills} + Описание: {about_me} +""" + + +class DefaultResumeEmbeddingVectorGenerator(ResumeEmbeddingVectorGenerator): + def __init__(self, ml_api_gateway: MlApiGateway) -> None: + self._ml_api_gateway = ml_api_gateway + + @override + async def generate( + self, + position: str, + about_me: str, + experience_type: ExperienceType, + key_skills: list[str], + ) -> list[float]: + text = EMBEDDING_TEXT_TEMPLATE.format_map( + { + "position": position, + "experience_type": experience_type, + "key_skills": ", ".join(key_skills), + "about_me": about_me, + } + ) + return await self._ml_api_gateway.generate_embedding(text) diff --git a/src/template_project/application/resume/data_gateway.py b/src/template_project/application/resume/data_gateway.py index 44008cf..80acfad 100644 --- a/src/template_project/application/resume/data_gateway.py +++ b/src/template_project/application/resume/data_gateway.py @@ -4,7 +4,6 @@ from typing import Protocol from template_project.application.resume.entity import ( Resume, - ResumeEmbeddingId, ResumeId, ResumePrediction, ) @@ -12,10 +11,6 @@ from template_project.application.user.entity import UserId class ResumeDataGateway(Protocol): - @abstractmethod - async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]: - raise NotImplementedError - @abstractmethod async def load(self, resume_id: ResumeId) -> Resume: raise NotImplementedError diff --git a/src/template_project/application/resume/resume_embedding_pipeline.py b/src/template_project/application/resume/resume_embedding_pipeline.py index 6455335..408e9e2 100644 --- a/src/template_project/application/resume/resume_embedding_pipeline.py +++ b/src/template_project/application/resume/resume_embedding_pipeline.py @@ -3,19 +3,20 @@ from collections.abc import Callable from Levenshtein import ratio from template_project.application.common.unit_of_work import UnitOfWork -from template_project.application.resume.data_gateway import ResumeDataGateway from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator +from template_project.application.vacancy.data_gateway import VacancyDataGateway +from template_project.application.vacancy.entity import Vacancy -def suitable_resumes_key( +def suitable_vacancies_key( resume: Resume, -) -> Callable[[Resume], bool]: - def wrapper(suitable_resume: Resume) -> bool: +) -> Callable[[Vacancy], bool]: + def wrapper(vacancy: Vacancy) -> bool: count_skills = 0 ratio_skill_sum = 0.0 for resum_key_skill in resume.key_skills: - for suitable_resume_key_skill in suitable_resume.key_skills: + for suitable_resume_key_skill in vacancy.key_skills: ratio_skill = ratio(resum_key_skill, suitable_resume_key_skill) if ratio_skill != 0: count_skills += 1 @@ -26,7 +27,7 @@ def suitable_resumes_key( except ZeroDivisionError: matching_skills = 0 - return resume.experience_type == suitable_resume.experience_type and matching_skills >= 50 + return resume.experience_type == vacancy.experience_type and matching_skills >= 50 return wrapper @@ -35,11 +36,11 @@ class ResumeEmbeddingPipeline: def __init__( self, unit_of_work: UnitOfWork, - resume_data_gateway: ResumeDataGateway, + vacancy_data_gateway: VacancyDataGateway, vector_generator: ResumeEmbeddingVectorGenerator, ) -> None: self.unit_of_work = unit_of_work - self.resume_data_gateway = resume_data_gateway + self.vacancy_data_gateway = vacancy_data_gateway self.vector_generator = vector_generator async def run( @@ -50,18 +51,20 @@ class ResumeEmbeddingPipeline: position=resume.position, about_me=resume.about_me, key_skills=resume.key_skills, + experience_type=resume.experience_type, ) resume_embedding = ResumeEmbedding.factory( resume_id=resume.id, vector=vector, ) - suitable_resumes = await self.resume_data_gateway.get_suitable_resumes(resume_embedding.id) - suitable_resumes_filtered = sorted( - suitable_resumes, - key=suitable_resumes_key(resume), + suitable_vacancies = await self.vacancy_data_gateway.get_suitable(resume_embedding.vector) + suitable_vacancies_filtered = sorted( + suitable_vacancies, + key=suitable_vacancies_key(resume), ) - suitable_resumes = suitable_resumes_filtered[:50] + + suitable_vacancies = suitable_vacancies_filtered[:50] # TODO: тут надо сделать отправку в ИИ diff --git a/src/template_project/application/resume/resume_prediction.py b/src/template_project/application/resume/resume_prediction.py new file mode 100644 index 0000000..16f79c2 --- /dev/null +++ b/src/template_project/application/resume/resume_prediction.py @@ -0,0 +1 @@ +# class ResumePredicition diff --git a/src/template_project/application/resume/vector_generator.py b/src/template_project/application/resume/vector_generator.py index 8b16902..8d9c67b 100644 --- a/src/template_project/application/resume/vector_generator.py +++ b/src/template_project/application/resume/vector_generator.py @@ -1,13 +1,15 @@ from abc import abstractmethod -from typing import Protocol + +from template_project.application.common.enums import ExperienceType -class ResumeEmbeddingVectorGenerator(Protocol): +class ResumeEmbeddingVectorGenerator: @abstractmethod async def generate( self, position: str, about_me: str, + experience_type: ExperienceType, key_skills: list[str], ) -> list[float]: raise NotImplementedError diff --git a/src/template_project/application/vacancy/data_gateway.py b/src/template_project/application/vacancy/data_gateway.py new file mode 100644 index 0000000..b0f83f3 --- /dev/null +++ b/src/template_project/application/vacancy/data_gateway.py @@ -0,0 +1,11 @@ +from abc import abstractmethod +from collections.abc import Sequence +from typing import Protocol + +from template_project.application.vacancy.entity import Vacancy + + +class VacancyDataGateway(Protocol): + @abstractmethod + async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]: + raise NotImplementedError diff --git a/src/template_project/ml/interactors/predict_salary.py b/src/template_project/ml/interactors/predict_salary.py index 8aab671..0775113 100644 --- a/src/template_project/ml/interactors/predict_salary.py +++ b/src/template_project/ml/interactors/predict_salary.py @@ -32,8 +32,7 @@ class PredictSalaryResponse: class PredictSalaryInteractor: async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse: return PredictSalaryResponse( - salary_from=Decimal("50000"), - salary_to=Decimal("80000"), + salary_from=Decimal(50000), + salary_to=Decimal(80000), recommended_skills=["python", "django", "postgresql"], ) - diff --git a/src/template_project/ml/ioc/interactor.py b/src/template_project/ml/ioc/interactor.py index 6cffda7..c56095d 100644 --- a/src/template_project/ml/ioc/interactor.py +++ b/src/template_project/ml/ioc/interactor.py @@ -9,4 +9,3 @@ class InteractorProvider(Provider): interactors = provide_all( PredictSalaryInteractor, ) - diff --git a/src/template_project/ml/routes/predict.py b/src/template_project/ml/routes/predict.py index 1d63f9d..e678f30 100644 --- a/src/template_project/ml/routes/predict.py +++ b/src/template_project/ml/routes/predict.py @@ -2,14 +2,13 @@ from decimal import Decimal from dishka import FromDishka from dishka.integrations.fastapi import DishkaRoute -from fastapi import APIRouter, status +from fastapi import APIRouter from pydantic import BaseModel, Field from template_project.application.resume.entity import ResumeId from template_project.ml.interactors.predict_salary import ( PredictSalaryInteractor, PredictSalaryRequest, - PredictSalaryResponse, VacancyInput, ) @@ -120,4 +119,3 @@ async def predict_salary( salary_to=response.salary_to, recommended_skills=response.recommended_skills, ) - diff --git a/src/template_project/web_api/ioc/data_gateway.py b/src/template_project/web_api/ioc/data_gateway.py index 6b36472..7b55d76 100644 --- a/src/template_project/web_api/ioc/data_gateway.py +++ b/src/template_project/web_api/ioc/data_gateway.py @@ -7,6 +7,7 @@ from template_project.adapters.data_gateways.notification_device import DefaultN from template_project.adapters.data_gateways.profile import DefaultProfileDataGateway from template_project.adapters.data_gateways.resume import DefaultResumeDataGateway, DefaultResumePredictionDataGateway from template_project.adapters.data_gateways.user import DefaultUserDataGateway +from template_project.adapters.data_gateways.vacancy import DefaultVacancyDataGateway from template_project.adapters.unit_of_work import DefaultUnitOfWork @@ -16,6 +17,7 @@ class DataGatewayProvider(Provider): unit_of_work = provide(WithParents[DefaultUnitOfWork]) data_gateways = provide_all( KeySkillsDataGateway, + WithParents[DefaultVacancyDataGateway], WithParents[DefaultUserDataGateway], WithParents[DefaultAccessTokenDataGateway], WithParents[DefaultAuthIdentityDataGateway],