You've already forked RekomenciBackend
не помню
This commit is contained in:
+39
-20
@@ -1,30 +1,49 @@
|
||||
import json
|
||||
import logging
|
||||
import urllib.parse
|
||||
from collections.abc import Sequence
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
from adaptix import DebugTrail, NameStyle, Retort, name_mapping
|
||||
|
||||
from dataset.data_structures import DataSetLine, Salary
|
||||
from dataset.upload_key_skills import upload_key_kills
|
||||
|
||||
retort = Retort(
|
||||
recipe=[
|
||||
name_mapping(Salary, name_style=NameStyle.CAMEL),
|
||||
],
|
||||
debug_trail=DebugTrail.DISABLE,
|
||||
strict_coercion=False,
|
||||
)
|
||||
DATASET_PATH: Final = Path("hh_ru_vacancies.jsonlines")
|
||||
BASE_URL: Final = "https://team-39-alpha-gm5qjkou.hack.prodcontest.ru"
|
||||
UPLOAD_KEY_SKILLS: Final = urllib.parse.urljoin(BASE_URL, "key_skills")
|
||||
|
||||
raw_lines = []
|
||||
with Path("hh_ru_vacancies.jsonlines").open("r", encoding="utf-8") as f:
|
||||
raw_lines = map(json.loads, f.readlines())
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
lines = retort.load(raw_lines, list[DataSetLine])
|
||||
# Ad-hoc exploration: collect the distinct non-empty ``experience`` values
# found in at most the first 1000 dataset lines and print them.
f = set()
c = 0  # number of dataset lines inspected so far
for line in lines:
    if c == 1000:
        break
    if line.experience:
        f.add(line.experience)
    # Count every inspected line. The previous ``c += 0`` never advanced the
    # counter, so the 1000-line cap above could never trigger.
    c += 1

print(f)
|
||||
def parse_dataset(file_path: Path) -> Sequence[DataSetLine]:
    """Load a JSON Lines dataset file into ``DataSetLine`` objects.

    Every non-blank line of *file_path* is decoded with ``json.loads``; the
    decoded records are then converted by an adaptix ``Retort`` configured
    with a camelCase name mapping for ``Salary`` and non-strict coercion.

    Args:
        file_path: Path of the UTF-8 encoded JSON Lines file to read.

    Returns:
        The loaded dataset lines, in file order.
    """
    retort = Retort(
        recipe=[
            name_mapping(Salary, name_style=NameStyle.CAMEL),
        ],
        debug_trail=DebugTrail.DISABLE,
        strict_coercion=False,
    )

    with file_path.open("r", encoding="utf-8") as f:
        # Iterate the file directly instead of readlines(), and skip blank
        # lines, which json.loads would reject. The list is built while the
        # file is still open, so nothing lazy outlives the ``with`` block.
        raw_lines = [json.loads(raw_line) for raw_line in f if raw_line.strip()]

    return retort.load(raw_lines, Sequence[DataSetLine])
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: parse the dataset and upload its key skills.

    Configures INFO-level logging, loads ``DATASET_PATH`` via
    ``parse_dataset`` and hands the result to ``upload_key_kills`` together
    with the ``UPLOAD_KEY_SKILLS`` endpoint URL.
    """
    logging.basicConfig(level=logging.INFO)
    logger.info("Parsing dataset...")

    parsed_lines = parse_dataset(DATASET_PATH)
    upload_key_kills(parsed_lines, UPLOAD_KEY_SKILLS)

    logger.info("finished script")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import logging
|
||||
from collections.abc import Sequence
|
||||
|
||||
from requests import Session
|
||||
|
||||
from dataset.data_structures import DataSetLine
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def upload_key_kills(
    dataset: Sequence[DataSetLine],
    upload_endpoint: str,
    max_upload_count: int | None = None,
) -> None:
    """POST the key skills of dataset lines to *upload_endpoint*.

    Each request body is JSON of the form ``{"key_skills": [...]}``. A
    response status other than 200 is logged as a warning and the upload
    continues with the next line.

    Args:
        dataset: Dataset lines whose ``key_skills`` are uploaded.
        upload_endpoint: URL that receives the POST requests.
        max_upload_count: If given, stop after this many dataset lines;
            ``None`` processes the whole dataset.
    """
    # NOTE(review): the original indentation is not recoverable from the
    # source view; this is reconstructed as one POST per dataset line, which
    # matches the per-iteration reset of ``key_skills`` — confirm.
    #
    # The session is used as a context manager so its connections are released
    # when the upload finishes or fails; previously it was never closed.
    with Session() as session:
        for count, line in enumerate(dataset):
            if max_upload_count is not None and count >= max_upload_count:
                break

            key_skills = list(line.key_skills)

            logger.info("Upload skills %r", key_skills)
            response = session.post(
                upload_endpoint,
                json={
                    "key_skills": key_skills,
                },
            )
            if response.status_code != 200:
                logger.warning("Doesn't upload skills. Status code %r", response.status_code)
            else:
                logger.info("Upload skills %r. Status code %r", key_skills, response.status_code)
|
||||
@@ -15,7 +15,7 @@ class KeySkillsDataGateway:
|
||||
async def query(self, query: str) -> Sequence[str]:
|
||||
statement = (
|
||||
select(key_skills_table.c.name)
|
||||
.where(key_skills_table.c.name.ilike(f"{query}%"))
|
||||
.where(key_skills_table.c.name.ilike(f"%{query}%"))
|
||||
.order_by(key_skills_table.c.name)
|
||||
.limit(30)
|
||||
)
|
||||
|
||||
@@ -6,7 +6,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from template_project.adapters.data_gateways.tables import resume_prediction_table, resume_table
|
||||
from template_project.application.resume.data_gateway import ResumeDataGateway, ResumePredictionDataGateway
|
||||
from template_project.application.resume.entity import Resume, ResumeEmbeddingId, ResumeId, ResumePrediction
|
||||
from template_project.application.resume.entity import Resume, ResumeId, ResumePrediction
|
||||
from template_project.application.resume.errors import ResumeNotFoundError
|
||||
from template_project.application.user.entity import UserId
|
||||
|
||||
@@ -15,10 +15,6 @@ class DefaultResumeDataGateway(ResumeDataGateway):
|
||||
def __init__(self, session: AsyncSession) -> None:
|
||||
self._session = session
|
||||
|
||||
@override
|
||||
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
|
||||
raise NotImplementedError
|
||||
|
||||
@override
|
||||
async def load(self, resume_id: ResumeId) -> Resume:
|
||||
resume = await self._session.get(Resume, resume_id)
|
||||
|
||||
@@ -3,7 +3,6 @@ from typing import Any, Final, override
|
||||
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from sqlalchemy import (
|
||||
ARRAY,
|
||||
Boolean,
|
||||
Column,
|
||||
DateTime,
|
||||
@@ -23,6 +22,7 @@ from sqlalchemy.orm import registry
|
||||
|
||||
from template_project.application.access_token.entity import AccessToken
|
||||
from template_project.application.auth_identity.entity import AuthIdentity, AuthMethod
|
||||
from template_project.application.common.enums import ExperienceType
|
||||
from template_project.application.notification_device.entity import NotificationDevice
|
||||
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
|
||||
from template_project.application.user.entity import User
|
||||
@@ -160,6 +160,28 @@ key_skills_table: Final = Table(
|
||||
Column("id", Integer, autoincrement=True, primary_key=True),
|
||||
Column("name", String, nullable=False, unique=True)
|
||||
)
|
||||
# ARRAY is imported next to its only use here because the module's top-level
# ``from sqlalchemy import (...)`` block is not guaranteed to provide it.
from sqlalchemy import ARRAY

# Table holding vacancies.
# ``description`` and ``key_skills`` previously had no column type, which makes
# SQLAlchemy fall back to NullType and fail when DDL is generated for the table.
vacancy_table: Final = Table(
    "vacancies",
    meta_data,
    Column("id", UUID, primary_key=True),
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column("position", String, nullable=False),
    Column("from_salary", Numeric, nullable=False),
    Column("to_salary", Numeric, nullable=False),
    Column("experience_type", Enum(ExperienceType), nullable=False),
    Column("description", String, nullable=False),
    # NOTE(review): key_skills is assumed to be a list of strings (it is
    # iterated and compared string-by-string by the matching code) — confirm
    # against the Vacancy entity.
    Column("key_skills", ARRAY(String), nullable=False),
)
|
||||
# Table holding one embedding vector per row, linked to a vacancy through the
# ``vacancy_id`` foreign key (rows are removed together with their vacancy).
vacancy_embedding_table: Final = Table(
    "vacancy_embedding",
    meta_data,
    Column("id", UUID, primary_key=True),
    Column("deleted_at", DateTime(timezone=True)),
    Column("created_at", DateTime(timezone=True), nullable=False),
    Column(
        "vacancy_id",
        UUID,
        ForeignKey("vacancies.id", ondelete="CASCADE"),
        nullable=False,
    ),
    Column("vector", Vector, nullable=False),
)
|
||||
|
||||
|
||||
mapper_registry.map_imperatively(User, user_table)
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
from collections.abc import Sequence
|
||||
from typing import override
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from template_project.adapters.data_gateways.tables import vacancy_embedding_table, vacancy_table
|
||||
from template_project.application.vacancy.data_gateway import VacancyDataGateway
|
||||
from template_project.application.vacancy.entity import Vacancy, VacancyEmbedding
|
||||
|
||||
|
||||
class DefaultVacancyDataGateway(VacancyDataGateway):
    """``VacancyDataGateway`` implementation backed by an ``AsyncSession``."""

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    @override
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return vacancies whose embedding is close to *vector*.

        Args:
            vector: Embedding to compare the stored vacancy embeddings with.

        Returns:
            Vacancies having an embedding whose cosine distance to *vector*
            is below 0.5.
        """
        statement = (
            select(Vacancy)
            # Join through the foreign key. The previous condition compared
            # the embedding's own primary key with the vacancy's primary key,
            # which are unrelated identifiers.
            .join(VacancyEmbedding, vacancy_embedding_table.c.vacancy_id == vacancy_table.c.id)
            # Cosine distance grows as vectors become less similar, so
            # "suitable" means a SMALL distance. The previous ``> 0.5`` kept
            # only the dissimilar vacancies.
            .where(vacancy_embedding_table.c.vector.cosine_distance(vector) < 0.5)
        )
        result = await self._session.execute(statement)
        return result.scalars().all()
|
||||
@@ -0,0 +1,12 @@
|
||||
from typing import cast
|
||||
|
||||
from httpx import AsyncClient
|
||||
|
||||
|
||||
class MlApiGateway:
    """Thin client for the ML service reached through an httpx ``AsyncClient``."""

    def __init__(self, client: AsyncClient) -> None:
        self._client = client

    async def generate_embedding(self, text: str) -> list[float]:
        """Request an embedding for *text* from the ``/get_embedding`` endpoint.

        Args:
            text: Text sent to the service as ``{"text": text}``.

        Returns:
            The value of the ``"embedding"`` key of the JSON response.

        Raises:
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx
                status.
        """
        response = await self._client.post("/get_embedding", json={"text": text})
        # Fail with an explicit HTTP error instead of a confusing KeyError or
        # JSON decode error when the service returns an error response.
        response.raise_for_status()
        return cast(list[float], response.json()["embedding"])
|
||||
@@ -0,0 +1,36 @@
|
||||
from typing import Final, override
|
||||
|
||||
from template_project.application.common.enums import ExperienceType
|
||||
|
||||
from template_project.adapters.ml_client import MlApiGateway
|
||||
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
|
||||
|
||||
EMBEDDING_TEXT_TEMPLATE: Final = """
|
||||
Позиция: {position}
|
||||
Опыт: {experience_type}
|
||||
Ключевые навыки: {key_skills}
|
||||
Описание: {about_me}
|
||||
"""
|
||||
|
||||
|
||||
class DefaultResumeEmbeddingVectorGenerator(ResumeEmbeddingVectorGenerator):
    """Builds a resume embedding by sending a text summary to the ML API."""

    def __init__(self, ml_api_gateway: MlApiGateway) -> None:
        self._ml_api_gateway = ml_api_gateway

    @override
    async def generate(
        self,
        position: str,
        about_me: str,
        experience_type: ExperienceType,
        key_skills: list[str],
    ) -> list[float]:
        """Render the resume fields into the text template and embed the result.

        The key skills are joined with ``", "`` before being substituted into
        ``EMBEDDING_TEXT_TEMPLATE``; the rendered text is passed to
        ``MlApiGateway.generate_embedding`` and its result is returned.
        """
        joined_skills = ", ".join(key_skills)
        prompt = EMBEDDING_TEXT_TEMPLATE.format(
            position=position,
            experience_type=experience_type,
            key_skills=joined_skills,
            about_me=about_me,
        )
        embedding = await self._ml_api_gateway.generate_embedding(prompt)
        return embedding
|
||||
@@ -4,7 +4,6 @@ from typing import Protocol
|
||||
|
||||
from template_project.application.resume.entity import (
|
||||
Resume,
|
||||
ResumeEmbeddingId,
|
||||
ResumeId,
|
||||
ResumePrediction,
|
||||
)
|
||||
@@ -12,10 +11,6 @@ from template_project.application.user.entity import UserId
|
||||
|
||||
|
||||
class ResumeDataGateway(Protocol):
|
||||
@abstractmethod
|
||||
async def get_suitable_resumes(self, embedding_id: ResumeEmbeddingId) -> Sequence[Resume]:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
async def load(self, resume_id: ResumeId) -> Resume:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -3,19 +3,20 @@ from collections.abc import Callable
|
||||
from Levenshtein import ratio
|
||||
|
||||
from template_project.application.common.unit_of_work import UnitOfWork
|
||||
from template_project.application.resume.data_gateway import ResumeDataGateway
|
||||
from template_project.application.resume.entity import Resume, ResumeEmbedding, ResumePrediction
|
||||
from template_project.application.resume.vector_generator import ResumeEmbeddingVectorGenerator
|
||||
from template_project.application.vacancy.data_gateway import VacancyDataGateway
|
||||
from template_project.application.vacancy.entity import Vacancy
|
||||
|
||||
|
||||
def suitable_resumes_key(
|
||||
def suitable_vacancies_key(
|
||||
resume: Resume,
|
||||
) -> Callable[[Resume], bool]:
|
||||
def wrapper(suitable_resume: Resume) -> bool:
|
||||
) -> Callable[[Vacancy], bool]:
|
||||
def wrapper(vacancy: Vacancy) -> bool:
|
||||
count_skills = 0
|
||||
ratio_skill_sum = 0.0
|
||||
for resum_key_skill in resume.key_skills:
|
||||
for suitable_resume_key_skill in suitable_resume.key_skills:
|
||||
for suitable_resume_key_skill in vacancy.key_skills:
|
||||
ratio_skill = ratio(resum_key_skill, suitable_resume_key_skill)
|
||||
if ratio_skill != 0:
|
||||
count_skills += 1
|
||||
@@ -26,7 +27,7 @@ def suitable_resumes_key(
|
||||
except ZeroDivisionError:
|
||||
matching_skills = 0
|
||||
|
||||
return resume.experience_type == suitable_resume.experience_type and matching_skills >= 50
|
||||
return resume.experience_type == vacancy.experience_type and matching_skills >= 50
|
||||
|
||||
return wrapper
|
||||
|
||||
@@ -35,11 +36,11 @@ class ResumeEmbeddingPipeline:
|
||||
def __init__(
|
||||
self,
|
||||
unit_of_work: UnitOfWork,
|
||||
resume_data_gateway: ResumeDataGateway,
|
||||
vacancy_data_gateway: VacancyDataGateway,
|
||||
vector_generator: ResumeEmbeddingVectorGenerator,
|
||||
) -> None:
|
||||
self.unit_of_work = unit_of_work
|
||||
self.resume_data_gateway = resume_data_gateway
|
||||
self.vacancy_data_gateway = vacancy_data_gateway
|
||||
self.vector_generator = vector_generator
|
||||
|
||||
async def run(
|
||||
@@ -50,18 +51,20 @@ class ResumeEmbeddingPipeline:
|
||||
position=resume.position,
|
||||
about_me=resume.about_me,
|
||||
key_skills=resume.key_skills,
|
||||
experience_type=resume.experience_type,
|
||||
)
|
||||
resume_embedding = ResumeEmbedding.factory(
|
||||
resume_id=resume.id,
|
||||
vector=vector,
|
||||
)
|
||||
|
||||
suitable_resumes = await self.resume_data_gateway.get_suitable_resumes(resume_embedding.id)
|
||||
suitable_resumes_filtered = sorted(
|
||||
suitable_resumes,
|
||||
key=suitable_resumes_key(resume),
|
||||
suitable_vacancies = await self.vacancy_data_gateway.get_suitable(resume_embedding.vector)
|
||||
suitable_vacancies_filtered = sorted(
|
||||
suitable_vacancies,
|
||||
key=suitable_vacancies_key(resume),
|
||||
)
|
||||
suitable_resumes = suitable_resumes_filtered[:50]
|
||||
|
||||
suitable_vacancies = suitable_vacancies_filtered[:50]
|
||||
|
||||
# TODO: тут надо сделать отправку в ИИ
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# class ResumePrediction
|
||||
@@ -1,13 +1,15 @@
|
||||
from abc import abstractmethod
|
||||
from typing import Protocol
|
||||
|
||||
from template_project.application.common.enums import ExperienceType
|
||||
|
||||
|
||||
class ResumeEmbeddingVectorGenerator(Protocol):
|
||||
class ResumeEmbeddingVectorGenerator:
|
||||
@abstractmethod
|
||||
async def generate(
|
||||
self,
|
||||
position: str,
|
||||
about_me: str,
|
||||
experience_type: ExperienceType,
|
||||
key_skills: list[str],
|
||||
) -> list[float]:
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -0,0 +1,11 @@
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Sequence
|
||||
from typing import Protocol
|
||||
|
||||
from template_project.application.vacancy.entity import Vacancy
|
||||
|
||||
|
||||
class VacancyDataGateway(Protocol):
    """Interface for looking up vacancies by embedding vector."""

    @abstractmethod
    async def get_suitable(self, vector: list[float]) -> Sequence[Vacancy]:
        """Return the vacancies considered suitable for the given *vector*.

        Implementations must override this method; the default body only
        raises ``NotImplementedError``.
        """
        raise NotImplementedError
|
||||
@@ -32,8 +32,7 @@ class PredictSalaryResponse:
|
||||
class PredictSalaryInteractor:
|
||||
async def execute(self, request: PredictSalaryRequest) -> PredictSalaryResponse:
|
||||
return PredictSalaryResponse(
|
||||
salary_from=Decimal("50000"),
|
||||
salary_to=Decimal("80000"),
|
||||
salary_from=Decimal(50000),
|
||||
salary_to=Decimal(80000),
|
||||
recommended_skills=["python", "django", "postgresql"],
|
||||
)
|
||||
|
||||
|
||||
@@ -9,4 +9,3 @@ class InteractorProvider(Provider):
|
||||
interactors = provide_all(
|
||||
PredictSalaryInteractor,
|
||||
)
|
||||
|
||||
|
||||
@@ -2,14 +2,13 @@ from decimal import Decimal
|
||||
|
||||
from dishka import FromDishka
|
||||
from dishka.integrations.fastapi import DishkaRoute
|
||||
from fastapi import APIRouter, status
|
||||
from fastapi import APIRouter
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from template_project.application.resume.entity import ResumeId
|
||||
from template_project.ml.interactors.predict_salary import (
|
||||
PredictSalaryInteractor,
|
||||
PredictSalaryRequest,
|
||||
PredictSalaryResponse,
|
||||
VacancyInput,
|
||||
)
|
||||
|
||||
@@ -120,4 +119,3 @@ async def predict_salary(
|
||||
salary_to=response.salary_to,
|
||||
recommended_skills=response.recommended_skills,
|
||||
)
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from template_project.adapters.data_gateways.notification_device import DefaultN
|
||||
from template_project.adapters.data_gateways.profile import DefaultProfileDataGateway
|
||||
from template_project.adapters.data_gateways.resume import DefaultResumeDataGateway, DefaultResumePredictionDataGateway
|
||||
from template_project.adapters.data_gateways.user import DefaultUserDataGateway
|
||||
from template_project.adapters.data_gateways.vacancy import DefaultVacancyDataGateway
|
||||
from template_project.adapters.unit_of_work import DefaultUnitOfWork
|
||||
|
||||
|
||||
@@ -16,6 +17,7 @@ class DataGatewayProvider(Provider):
|
||||
unit_of_work = provide(WithParents[DefaultUnitOfWork])
|
||||
data_gateways = provide_all(
|
||||
KeySkillsDataGateway,
|
||||
WithParents[DefaultVacancyDataGateway],
|
||||
WithParents[DefaultUserDataGateway],
|
||||
WithParents[DefaultAccessTokenDataGateway],
|
||||
WithParents[DefaultAuthIdentityDataGateway],
|
||||
|
||||
Reference in New Issue
Block a user