не помню

This commit is contained in:
ivankirpichnikov
2025-11-22 18:15:29 +03:00
parent 995141a200
commit effbcfbc2d
17 changed files with 210 additions and 54 deletions
+33 -14
View File
@@ -1,30 +1,49 @@
import json import json
import logging
import urllib.parse
from collections.abc import Sequence
from pathlib import Path from pathlib import Path
from typing import Final
from adaptix import DebugTrail, NameStyle, Retort, name_mapping from adaptix import DebugTrail, NameStyle, Retort, name_mapping
from dataset.data_structures import DataSetLine, Salary from dataset.data_structures import DataSetLine, Salary
from dataset.upload_key_skills import upload_key_kills
retort = Retort( DATASET_PATH: Final = Path("hh_ru_vacancies.jsonlines")
BASE_URL: Final = "https://team-39-alpha-gm5qjkou.hack.prodcontest.ru"
UPLOAD_KEY_SKILLS: Final = urllib.parse.urljoin(BASE_URL, "key_skills")
logger = logging.getLogger(__name__)
def parse_dataset(file_path: Path) -> Sequence[DataSetLine]:
retort = Retort(
recipe=[ recipe=[
name_mapping(Salary, name_style=NameStyle.CAMEL), name_mapping(Salary, name_style=NameStyle.CAMEL),
], ],
debug_trail=DebugTrail.DISABLE, debug_trail=DebugTrail.DISABLE,
strict_coercion=False, strict_coercion=False,
) )
raw_lines = [] raw_lines = []
with Path("hh_ru_vacancies.jsonlines").open("r", encoding="utf-8") as f: with file_path.open("r", encoding="utf-8") as f:
raw_lines = map(json.loads, f.readlines()) raw_lines = map(json.loads, f.readlines())
lines = retort.load(raw_lines, list[DataSetLine]) return retort.load(raw_lines, Sequence[DataSetLine])
f = set()
c = 0
for line in lines:
if c == 1000:
break
if line.experience:
f.add(line.experience)
c += 0
print(f)
def main() -> None:
    """Parse the dataset file and upload the key skills it contains."""
    logging.basicConfig(level=logging.INFO)

    logger.info("Parsing dataset...")
    parsed_lines = parse_dataset(DATASET_PATH)
    upload_key_kills(parsed_lines, UPLOAD_KEY_SKILLS)

    logger.info("finished script")
if __name__ == "__main__":
main()
+37
View File
@@ -0,0 +1,37 @@
import logging
from collections.abc import Sequence
from requests import Session
from dataset.data_structures import DataSetLine
logger = logging.getLogger(__name__)
def upload_key_kills(
    dataset: Sequence[DataSetLine],
    upload_endpoint: str,
    max_upload_count: int | None = None
) -> None:
    """Upload the key skills of the dataset lines to ``upload_endpoint``.

    A POST request with a ``{"key_skills": [...]}`` JSON body is sent for
    each dataset line. A response whose status code is not 200 is logged as
    a warning and the upload continues with the next line.

    Args:
        dataset: Parsed dataset lines; only ``line.key_skills`` is read.
        upload_endpoint: URL that receives the POST requests.
        max_upload_count: Maximum number of dataset lines to process;
            ``None`` processes every line.
    """
    # NOTE(review): the indentation of this function was not visible in the
    # reviewed view. It is assumed that one request is sent per dataset line
    # and that the buffer is reset after every request - confirm.

    # The context manager closes the session (and its pooled connections) on
    # exit, including when a request raises; previously it was never closed.
    with Session() as session:
        key_skills = []
        for count, line in enumerate(dataset):
            if max_upload_count is not None and count >= max_upload_count:
                break

            key_skills.extend(line.key_skills)
            logger.info("Upload skills %r", key_skills)
            response = session.post(
                upload_endpoint,
                json={
                    "key_skills": key_skills
                }
            )
            if response.status_code != 200:
                logger.warning("Doesn't upload skills. Status code %r", response.status_code)
            else:
                logger.info("Upload skills %r. Status code %r", key_skills, response.status_code)
            key_skills = []
@@ -15,7 +15,7 @@ class KeySkillsDataGateway:
async def query(self, query: str) -> Sequence[str]: async def query(self, query: str) -> Sequence[str]:
statement = ( statement = (
select(key_skills_table.c.name) select(key_skills_table.c.name)
.where(key_skills_table.c.name.ilike(f"{query}%")) .where(key_skills_table.c.name.ilike(f"%{query}%"))
.order_by(key_skills_table.c.name) .order_by(key_skills_table.c.name)
.limit(30) .limit(30)
) )
@@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from template_project.adapters.data_gateways.tables import resume_prediction_table, resume_table from template_project.adapters.data_gateways.tables import resume_prediction_table, resume_table
from template_project.application.resume.data_gateway import ResumeDataGateway, ResumePredictionDataGateway from template_project.application.resume.data_gateway import ResumeDataGateway, ResumePredictionDataGateway
from template_project.application.resume.entity import Resume, ResumeEmbeddingId, ResumeId, ResumePrediction from template_project.application.resume.entity import Resume, ResumeId, ResumePrediction
from template_project.application.resume.errors import ResumeNotFoundError from template_project.application.resume.errors import ResumeNotFoundError
from template_project.application.user.entity import UserId from template_project.application.user.entity import UserId
@@ -15,10 +15,6 @@ class DefaultResumeDataGateway(ResumeDataGateway):
def __init__(self, session: AsyncSession) -> None: def __init__(self, session: AsyncSession) -> None:
self._session = session self._session = session
@override
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
raise NotImplementedError
@override @override
async def load(self, resume_id: ResumeId) -> Resume: async def load(self, resume_id: ResumeId) -> Resume:
resume = await self._session.get(Resume, resume_id) resume = await self._session.get(Resume, resume_id)
@@ -3,7 +3,6 @@ from typing import Any, Final, override
from pgvector.sqlalchemy import Vector from pgvector.sqlalchemy import Vector
from sqlalchemy import ( from sqlalchemy import (
ARRAY,
Boolean, Boolean,
Column, Column,
DateTime, DateTime,
@@ -23,6 +22,7 @@ from sqlalchemy.orm import registry
from template_project.application.access_token.entity import AccessToken from template_project.application.access_token.entity import AccessToken
from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod
from template_project.application.common.enums import ExperienceType
from template_project.application.notification_device.entity import NotificationDevice from template_project.application.notification_device.entity import NotificationDevice
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
from template_project.application.user.entity import User from template_project.application.user.entity import User
@@ -160,6 +160,28 @@ key_skills_table: Final = Table(
Column("id", Integer, autoincrement=True, primary_key=True), Column("id", Integer, autoincrement=True, primary_key=True),
Column("name", String, nullable=False, unique=True) Column("name", String, nullable=False, unique=True)
) )
# ARRAY is required by the ``key_skills`` column below.
# NOTE(review): move this import into the top-of-file
# ``from sqlalchemy import (...)`` group.
from sqlalchemy import ARRAY

# Table "vacancies": one row per vacancy.
vacancy_table: Final = Table(
    "vacancies",
    meta_data,
    Column("id", UUID, primary_key=True),
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column("position", String, nullable=False),
    Column("from_salary", Numeric, nullable=False),
    Column("to_salary", Numeric, nullable=False),
    Column("experience_type", Enum(ExperienceType), nullable=False),
    # A column declared without a type falls back to NullType, which cannot
    # be rendered in CREATE TABLE; both columns below previously had no type.
    Column("description", String, nullable=False),
    # NOTE(review): assumes key skills are stored as an array of strings
    # (they are iterated as strings by the matching code) - confirm.
    Column("key_skills", ARRAY(String), nullable=False),
)
# Table "vacancy_embedding": each row stores one embedding vector linked to a vacancy.
vacancy_embedding_table: Final = Table(
"vacancy_embedding",
meta_data,
Column("id", UUID, primary_key=True),
# Nullable, unlike created_at; presumably a soft-delete marker - confirm.
Column("deleted_at", DateTime(timezone=True)),
Column("created_at", DateTime(timezone=True), nullable=False),
# Foreign key to vacancies.id; ON DELETE CASCADE removes the row together with its vacancy.
Column("vacancy_id", UUID, ForeignKey("vacancies.id", ondelete="CASCADE"), nullable=False),
# pgvector column; no dimension is passed to Vector here.
Column("vector", Vector, nullable=False),
)
mapper_registry.map_imperatively(User, user_table) mapper_registry.map_imperatively(User, user_table)
@@ -0,0 +1,24 @@
from collections.abc import Sequence
from typing import override
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from template_project.adapters.data_gateways.tables import vacancy_embedding_table, vacancy_table
from template_project.application.vacancy.data_gateway import VacancyDataGateway
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
class DefaultVacancyDataGateway(VacancyDataGateway):
    """``VacancyDataGateway`` implementation backed by an ``AsyncSession``."""

    # Largest cosine distance between the query vector and a stored vacancy
    # embedding for the vacancy to be returned.
    _MAX_COSINE_DISTANCE = 0.5

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    @override
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return vacancies whose stored embedding is close to ``vector``.

        Args:
            vector: Embedding to compare the stored vacancy embeddings with.

        Returns:
            Vacancies having an embedding whose cosine distance to ``vector``
            is below ``_MAX_COSINE_DISTANCE``.
        """
        statement = (
            select(Vacancy)
            # An embedding references its vacancy through ``vacancy_id``; the
            # embedding's own ``id`` is an unrelated primary key, so joining
            # on it (as before) matched no rows.
            .join(VacancyEmbedding, vacancy_embedding_table.c.vacancy_id == vacancy_table.c.id)
            # A smaller cosine distance means more similar vectors, so
            # suitable vacancies are the ones below the threshold; the
            # previous ``> 0.5`` kept the dissimilar ones instead.
            .where(vacancy_embedding_table.c.vector.cosine_distance(vector) < self._MAX_COSINE_DISTANCE)
        )
        result = await self._session.execute(statement)
        return result.scalars().all()
@@ -0,0 +1,12 @@
from typing import cast
from httpx import AsyncClient
class MlApiGateway:
    """Thin client for the ML service's embedding endpoint."""

    def __init__(self, client: AsyncClient) -> None:
        self._client = client

    async def generate_embedding(self, text: str) -> list[float]:
        """Request an embedding for ``text`` from the ML service.

        Sends ``{"text": text}`` as JSON to ``/get_embedding`` and returns the
        ``"embedding"`` field of the JSON response.

        Args:
            text: Text to embed.

        Returns:
            The value of the ``"embedding"`` key in the response body.

        Raises:
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx
                status code.
        """
        response = await self._client.post("/get_embedding", json={"text": text})
        # Fail with a clear HTTP error instead of an obscure KeyError or JSON
        # decoding error when the service returns an error response.
        response.raise_for_status()
        return cast(list[float], response.json()["embedding"])
@@ -0,0 +1,36 @@
from typing import Final, override
from template_project.application.common.enums import ExperienceType
from template_project.adapters.ml_client import MlApiGateway
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
EMBEDDING_TEXT_TEMPLATE: Final = """
Позиция: {position}
Опыт: {experience_type}
Ключевые навыки: {key_skills}
Описание: {about_me}
"""
class DefaultResumeEmbeddingVectorGenerator(ResumeEmbeddingVectorGenerator):
    """Generates resume embedding vectors through ``MlApiGateway``."""

    def __init__(self, ml_api_gateway: MlApiGateway) -> None:
        self._ml_api_gateway = ml_api_gateway

    @override
    async def generate(
        self,
        position: str,
        about_me: str,
        experience_type: ExperienceType,
        key_skills: list[str],
    ) -> list[float]:
        """Render the resume fields into text and return its embedding.

        The fields are substituted into ``EMBEDDING_TEXT_TEMPLATE`` (key
        skills joined with ``", "``) and the resulting text is passed to
        ``MlApiGateway.generate_embedding``.
        """
        joined_skills = ", ".join(key_skills)
        rendered = EMBEDDING_TEXT_TEMPLATE.format(
            position=position,
            experience_type=experience_type,
            key_skills=joined_skills,
            about_me=about_me,
        )
        embedding = await self._ml_api_gateway.generate_embedding(rendered)
        return embedding
@@ -4,7 +4,6 @@ from typing import Protocol
from template_project.application.resume.entity import ( from template_project.application.resume.entity import (
Resume, Resume,
ResumeEmbeddingId,
ResumeId, ResumeId,
ResumePrediction, ResumePrediction,
) )
@@ -12,10 +11,6 @@ from template_project.application.user.entity import UserId
class ResumeDataGateway(Protocol): class ResumeDataGateway(Protocol):
@abstractmethod
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
raise NotImplementedError
@abstractmethod @abstractmethod
async def load(self, resume_id: ResumeId) -> Resume: async def load(self, resume_id: ResumeId) -> Resume:
raise NotImplementedError raise NotImplementedError
@@ -3,19 +3,20 @@ from collections.abc import Callable
from Levenshtein import ratio from Levenshtein import ratio
from template_project.application.common.unit_of_work import UnitOfWork from template_project.application.common.unit_of_work import UnitOfWork
from template_project.application.resume.data_gateway import ResumeDataGateway
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
from template_project.application.vacancy.data_gateway import VacancyDataGateway
from template_project.application.vacancy.entity import Vacancy
def suitable_resumes_key( def suitable_vacancies_key(
resume: Resume, resume: Resume,
) -> Callable[[Resume], bool]: ) -> Callable[[Vacancy], bool]:
def wrapper(suitable_resume: Resume) -> bool: def wrapper(vacancy: Vacancy) -> bool:
count_skills = 0 count_skills = 0
ratio_skill_sum = 0.0 ratio_skill_sum = 0.0
for resum_key_skill in resume.key_skills: for resum_key_skill in resume.key_skills:
for suitable_resume_key_skill in suitable_resume.key_skills: for suitable_resume_key_skill in vacancy.key_skills:
ratio_skill = ratio(resum_key_skill, suitable_resume_key_skill) ratio_skill = ratio(resum_key_skill, suitable_resume_key_skill)
if ratio_skill != 0: if ratio_skill != 0:
count_skills += 1 count_skills += 1
@@ -26,7 +27,7 @@ def suitable_resumes_key(
except ZeroDivisionError: except ZeroDivisionError:
matching_skills = 0 matching_skills = 0
return resume.experience_type == suitable_resume.experience_type and matching_skills >= 50 return resume.experience_type == vacancy.experience_type and matching_skills >= 50
return wrapper return wrapper
@@ -35,11 +36,11 @@ class ResumeEmbeddingPipeline:
def __init__( def __init__(
self, self,
unit_of_work: UnitOfWork, unit_of_work: UnitOfWork,
resume_data_gateway: ResumeDataGateway, vacancy_data_gateway: VacancyDataGateway,
vector_generator: ResumeEmbeddingVectorGenerator, vector_generator: ResumeEmbeddingVectorGenerator,
) -> None: ) -> None:
self.unit_of_work = unit_of_work self.unit_of_work = unit_of_work
self.resume_data_gateway = resume_data_gateway self.vacancy_data_gateway = vacancy_data_gateway
self.vector_generator = vector_generator self.vector_generator = vector_generator
async def run( async def run(
@@ -50,18 +51,20 @@ class ResumeEmbeddingPipeline:
position=resume.position, position=resume.position,
about_me=resume.about_me, about_me=resume.about_me,
key_skills=resume.key_skills, key_skills=resume.key_skills,
experience_type=resume.experience_type,
) )
resume_embedding = ResumeEmbedding.factory( resume_embedding = ResumeEmbedding.factory(
resume_id=resume.id, resume_id=resume.id,
vector=vector, vector=vector,
) )
suitable_resumes = await self.resume_data_gateway.get_suitable_resumes(resume_embedding.id) suitable_vacancies = await self.vacancy_data_gateway.get_suitable(resume_embedding.vector)
suitable_resumes_filtered = sorted( suitable_vacancies_filtered = sorted(
suitable_resumes, suitable_vacancies,
key=suitable_resumes_key(resume), key=suitable_vacancies_key(resume),
) )
suitable_resumes = suitable_resumes_filtered[:50]
suitable_vacancies = suitable_vacancies_filtered[:50]
# TODO: тут надо сделать отправку в ИИ # TODO: тут надо сделать отправку в ИИ
@@ -0,0 +1 @@
# class ResumePrediction
@@ -1,13 +1,15 @@
from abc import abstractmethod from abc import abstractmethod
from typing import Protocol
from template_project.application.common.enums import ExperienceType
class ResumeEmbeddingVectorGenerator(Protocol): class ResumeEmbeddingVectorGenerator:
@abstractmethod @abstractmethod
async def generate( async def generate(
self, self,
position: str, position: str,
about_me: str, about_me: str,
experience_type: ExperienceType,
key_skills: list[str], key_skills: list[str],
) -> list[float]: ) -> list[float]:
raise NotImplementedError raise NotImplementedError
@@ -0,0 +1,11 @@
from abc import abstractmethod
from collections.abc import Sequence
from typing import Protocol
from template_project.application.vacancy.entity import Vacancy
class VacancyDataGateway(Protocol):
    """Interface for looking up vacancies by embedding vector."""

    @abstractmethod
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return the vacancies considered suitable for ``vector``.

        Implementations must override this method; the default body only
        raises ``NotImplementedError``.
        """
        raise NotImplementedError
@@ -32,8 +32,7 @@ class PredictSalaryResponse:
class PredictSalaryInteractor: class PredictSalaryInteractor:
async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse: async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse:
return PredictSalaryResponse( return PredictSalaryResponse(
salary_from=Decimal("50000"), salary_from=Decimal(50000),
salary_to=Decimal("80000"), salary_to=Decimal(80000),
recommended_skills=["python", "django", "postgresql"], recommended_skills=["python", "django", "postgresql"],
) )
@@ -9,4 +9,3 @@ class InteractorProvider(Provider):
interactors = provide_all( interactors = provide_all(
PredictSalaryInteractor, PredictSalaryInteractor,
) )
+1 -3
View File
@@ -2,14 +2,13 @@ from decimal import Decimal
from dishka import FromDishka from dishka import FromDishka
from dishka.integrations.fastapi import DishkaRoute from dishka.integrations.fastapi import DishkaRoute
from fastapi import APIRouter, status from fastapi import APIRouter
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from template_project.application.resume.entity import ResumeId from template_project.application.resume.entity import ResumeId
from template_project.ml.interactors.predict_salary import ( from template_project.ml.interactors.predict_salary import (
PredictSalaryInteractor, PredictSalaryInteractor,
PredictSalaryRequest, PredictSalaryRequest,
PredictSalaryResponse,
VacancyInput, VacancyInput,
) )
@@ -120,4 +119,3 @@ async def predict_salary(
salary_to=response.salary_to, salary_to=response.salary_to,
recommended_skills=response.recommended_skills, recommended_skills=response.recommended_skills,
) )
@@ -7,6 +7,7 @@ from template_project.adapters.data_gateways.notification_device import DefaultN
from template_project.adapters.data_gateways.profile import DefaultProfileDataGateway from template_project.adapters.data_gateways.profile import DefaultProfileDataGateway
from template_project.adapters.data_gateways.resume import DefaultResumeDataGateway, DefaultResumePredictionDataGateway from template_project.adapters.data_gateways.resume import DefaultResumeDataGateway, DefaultResumePredictionDataGateway
from template_project.adapters.data_gateways.user import DefaultUserDataGateway from template_project.adapters.data_gateways.user import DefaultUserDataGateway
from template_project.adapters.data_gateways.vacancy import DefaultVacancyDataGateway
from template_project.adapters.unit_of_work import DefaultUnitOfWork from template_project.adapters.unit_of_work import DefaultUnitOfWork
@@ -16,6 +17,7 @@ class DataGatewayProvider(Provider):
unit_of_work = provide(WithParents[DefaultUnitOfWork]) unit_of_work = provide(WithParents[DefaultUnitOfWork])
data_gateways = provide_all( data_gateways = provide_all(
KeySkillsDataGateway, KeySkillsDataGateway,
WithParents[DefaultVacancyDataGateway],
WithParents[DefaultUserDataGateway], WithParents[DefaultUserDataGateway],
WithParents[DefaultAccessTokenDataGateway], WithParents[DefaultAccessTokenDataGateway],
WithParents[DefaultAuthIdentityDataGateway], WithParents[DefaultAuthIdentityDataGateway],