Commit 942b7fe6 authored by nick zheng's avatar nick zheng

feat: 知识库上传文件支持自定义分段

parent 7d9d7423
......@@ -282,7 +282,7 @@ personal_space_module:
agent_title_input_placeholder: 'Please enter the application name'
agent_title_input_rule_message: 'Please enter the application name'
agent_desc_input_placeholder: 'Please describe your app, which will be displayed regularly after the app is published'
update_avatar: 'Modify profile picture'
update_avatar: 'Modify'
agent_system_prompt: 'Role instruction'
agent_system_popover_message: 'Through the Instruction function, you can precisely set the scope of the Agent application. This includes specifying the role the application will play, the components it can use, and the format and style of the output. In addition, you can specify what actions the app cannot perform, and so on.'
optimize_agent_system_prompt: 'Optimize'
......@@ -464,6 +464,7 @@ personal_space_module:
segment: 'Segment'
auto_segment: 'Automatic segmentation'
custom_segment: 'Custom segmentation'
add_chunk_up_message: 'Add fragment up'
add_chunk_down_message: 'Add fragment down'
search_knowledge_chunk_placeholder: 'Please enter a slice name'
......@@ -472,6 +473,15 @@ personal_space_module:
knowledge_chunk_content_input_rule: 'The content cannot be empty'
delete_knowledge_chunk_content_message: 'Confirm to delete the slice'
divide_by_word_count: 'Divide by word count'
Chinese_comma: 'Chinese comma'
Chinese_period: 'Chinese period'
Chinese_question_mark: 'Chinese question mark'
Chinese_exclamation_mark: 'Chinese exclamation mark'
English_period: 'English period'
English_exclamation_mark: 'English exclamation mark'
ellipsis: 'Ellipsis'
upload_document_module:
segment_setting: 'Segmented processing'
data_process: 'Data processing'
......@@ -486,6 +496,16 @@ personal_space_module:
upload_error_message: 'Upload failed. Please delete the file and try again'
default_segment_setting_title: 'Automatic segmentation and cleaning'
default_segment_setting_desc: 'Automatic segmentation and preprocessing rules'
custom_segment_setting_desc: 'Custom segmentation rules, segmentation lengths, and preprocessing rules'
segment_identifier: 'Segment identifier'
segment_identifier_tip: 'Divide the text according to the selected identifier'
please_select_segment_identifier: 'Please select segment identifier'
segment_maximum_number_of_words: 'Segment maximum number of words'
segment_maximum_number_of_words_tip: 'The segment word count should fit within the context length of the model used by the application. A larger word count recalls richer context, while a smaller word count recalls more concise information. The maximum is 1000 words'
please_input_segment_word_number: 'Please input segment word number'
segment_overlap_word_proportion: 'Segment overlap word proportion'
segment_overlap_word_proportion_tip: 'The ratio of the "number of overlapping characters" between the current slice and its adjacent slices to the configured "Maximum length of slice". If the overlapping part contains an incomplete sentence, that sentence is dropped from the slice. The larger the proportion, the more characters adjacent slices share; the smaller the proportion, the fewer they share. The maximum value is 25'
please_input_segment_overlap_word_proportion: 'Please input segment overlap word proportion'
data_process_tip_message: 'Clicking Confirm does not affect data processing; the document can be referenced once processing is complete'
share_agent_module:
......
......@@ -462,6 +462,7 @@ personal_space_module:
segment: '分段'
auto_segment: '自动分段'
custom_segment: '自定义分段'
add_chunk_up_message: '向上添加分片'
add_chunk_down_message: '向下添加分片'
search_knowledge_chunk_placeholder: '请输入切片名称'
......@@ -470,6 +471,15 @@ personal_space_module:
knowledge_chunk_content_input_rule: '内容不能为空'
delete_knowledge_chunk_content_message: '确认删除该切片'
divide_by_word_count: '按字数切分'
Chinese_comma: '中文逗号'
Chinese_period: '中文句号'
Chinese_question_mark: '中文问号'
Chinese_exclamation_mark: '中文感叹号'
English_period: '英文句号'
English_exclamation_mark: '英文感叹号'
ellipsis: '省略号'
upload_document_module:
segment_setting: '分段处理'
data_process: '数据处理'
......@@ -484,6 +494,16 @@ personal_space_module:
upload_error_message: '上传失败,请删除文件后再进行操作'
default_segment_setting_title: '自动分段与清洗'
default_segment_setting_desc: '自动分段与预处理规则'
custom_segment_setting_desc: '自定义分段规则、分段长度及预处理规则'
segment_identifier: '分段标识符'
segment_identifier_tip: '按照所选的标识符切分文本'
please_select_segment_identifier: '请选择分段标识符'
segment_maximum_number_of_words: '分段最多字数'
segment_maximum_number_of_words_tip: '分段字数应匹配应用中的模型上下文长度范围,字数越长,召回的上下文越丰富,字数越少,召回的信息更精简,最多字数不能超过1000字'
please_input_segment_word_number: '请输入分段字数'
segment_overlap_word_proportion: '分段重叠字数占比'
segment_overlap_word_proportion_tip: '当前切片与前后切片的“重叠部分字符数”相较于设置的“切片最大长度”的比例,如果重叠部分存在不完整的句子,则此切片舍去该句,占比越大,相邻切片重叠字符越多,占比越少,重叠字符越少,最大值为25'
please_input_segment_overlap_word_proportion: '请输入分段重叠字数占比'
data_process_tip_message: '点击确认不影响数据处理,处理完毕后可进行引用'
share_agent_module:
......
......@@ -462,6 +462,7 @@ personal_space_module:
segment: '分段'
auto_segment: '自動分段'
custom_segment: '自定義分段'
add_chunk_up_message: '向上添加分片'
add_chunk_down_message: '向下添加分片'
search_knowledge_chunk_placeholder: '請輸入切片名稱'
......@@ -470,6 +471,15 @@ personal_space_module:
knowledge_chunk_content_input_rule: '內容不能為空'
delete_knowledge_chunk_content_message: '確認刪除該切片'
divide_by_word_count: '按字數切分'
Chinese_comma: '中文逗號'
Chinese_period: '中文句號'
Chinese_question_mark: '中文問號'
Chinese_exclamation_mark: '中文感嘆號'
English_period: '英文句號'
English_exclamation_mark: '英文感嘆號'
ellipsis: '省略號'
upload_document_module:
segment_setting: '分段處理'
data_process: '數據處理'
......@@ -484,6 +494,16 @@ personal_space_module:
upload_error_message: '上傳失敗,請刪除文件後再進行操作'
default_segment_setting_title: '自動分段與清洗'
default_segment_setting_desc: '自動分段與預處理規則'
custom_segment_setting_desc: '自定義分段規則、分段長度及預處理規則'
segment_identifier: '分段標識符'
segment_identifier_tip: '按照所選的標識符切分文本'
please_select_segment_identifier: '請選擇分段標識符'
segment_maximum_number_of_words: '分段最多字數'
segment_maximum_number_of_words_tip: '分段字數應匹配應用中的模型上下文長度範圍,字數越長,召回的上下文越豐富,字數越少,召回的信息更精簡,最多字數不能超過1000字'
please_input_segment_word_number: '請輸入分段字數'
segment_overlap_word_proportion: '分段重疊字數佔比'
segment_overlap_word_proportion_tip: '當前切片與前後切片的“重疊部分字符數”相較於設置的“切片最大長度”的比例,如果重疊部分存在不完整的句子,則此切片捨去該句,佔比越大,相鄰切片重疊字符越多,佔比越少,重疊字符越少,最大值為25'
please_input_segment_overlap_word_proportion: '請輸入分段重疊字數佔比'
data_process_tip_message: '點擊確認不影響數據處理,處理完畢後可進行引用'
share_agent_module:
......
......@@ -28,6 +28,7 @@ const { paginationData } = usePagination()
const currentKdId = ref(0)
const currentKnowledgeDocumentName = ref('')
const currentKnowledgeDocumentUrl = ref('')
const currentKnowledgeSegmentationType = ref('')
const scrollBarRef = ref<ScrollbarInst | null>(null)
......@@ -79,6 +80,7 @@ function handleGetKnowledgeDocumentDetail() {
if (res.code === 0) {
currentKnowledgeDocumentName.value = res.data[0].documentName
currentKnowledgeDocumentUrl.value = res.data[0].documentUrl
currentKnowledgeSegmentationType.value = res.data[0]?.segmentationConfig?.segmentationType
}
})
.catch(() => {
......@@ -229,7 +231,11 @@ async function handleGetKnowledgeChunkListUpdatePageSize(pageSize: number) {
{{ t('personal_space_module.knowledge_module.segment') }}
</li>
<li class="border-inactive-border-color select-none rounded-full border px-4 py-1 leading-[18px]">
{{ t('personal_space_module.knowledge_module.auto_segment') }}
{{
currentKnowledgeSegmentationType === 'DEFAULT'
? t('personal_space_module.knowledge_module.auto_segment')
: t('personal_space_module.knowledge_module.custom_segment')
}}
</li>
</ul>
</div>
......
......@@ -17,13 +17,13 @@ export interface KnowledgeItem {
}
/**
 * Segmentation settings sent with a knowledge document.
 *
 * Each property is declared exactly once: the previous text also carried the
 * stale `segmentationType: 'Default'` and `punctuations: string` members,
 * which conflict with the declarations below.
 */
export interface SegmentationConfigInterface {
  // Segmentation mode. The literals (including the lower-case "f" in
  // 'NUMBER_Of_WORDS') are compared verbatim elsewhere, so keep them as-is.
  segmentationType: 'DEFAULT' | 'IDENTIFIER' | 'NUMBER_Of_WORDS'
  // NOTE(review): presumably the maximum number of words per segment — confirm.
  chunkSize: number
  scrapSize: number
  // NOTE(review): presumably the overlap percentage between slices — confirm.
  repetitionRate: number
  relationInfo: string[]
  regex: string
  // Selected split identifiers (a list, not a single string).
  punctuations: string[]
}
export interface KnowledgeDocumentItem {
......
......@@ -7,6 +7,7 @@ import UploadFile from './upload-file.vue'
import SegmentSetting from './segment-setting.vue'
import DataProcess from './data-process.vue'
import { fetchTrainKnowledge } from '@/apis/knowledge'
import { SegmentationConfigInterface } from '../../types'
interface StepItem {
stepId: number
......@@ -55,19 +56,11 @@ function handleToSegmentSetting(kdId: number[]) {
currentKdIdList.value = kdId
}
async function handleToDataProcess() {
async function handleToDataProcess(segmentationConfig: SegmentationConfigInterface) {
const res = await fetchTrainKnowledge({
knowledgeInfoId: currentKnowledgeId.value,
kdIds: currentKdIdList.value,
segmentationConfig: {
segmentationType: 'DEFAULT',
chunkSize: 300,
scrapSize: 80,
repetitionRate: 5,
relationInfo: [''],
regex: '',
punctuations: [''],
},
segmentationConfig,
})
if (res.code === 0) {
......
<script setup lang="ts">
import { computed, ref } from 'vue'
import { useI18n } from 'vue-i18n'
import { FormInst, FormRules } from 'naive-ui'
import { Help } from '@icon-park/vue-next'
import { SegmentationConfigInterface } from '../../types'
// Events this component can emit to its parent.
interface Emit {
(e: 'prev'): void
// NOTE(review): 'next' is declared both without and with a payload — confirm
// whether the payload-less overload is still required.
(e: 'next'): void
// 'next' carrying the segmentation config.
(e: 'next', value: SegmentationConfigInterface): void
}
// Translation function used for option labels and validation messages.
const { t } = useI18n()
const emit = defineEmits<Emit>()
// Form model, initialised from defaultSegmentationConfig().
const segmentationConfig = ref<SegmentationConfigInterface>(defaultSegmentationConfig())
// Template ref for the <n-form>; starts as null (the form is rendered under a v-if).
const segmentationConfigFormRef = ref<FormInst | null>(null)
// Validation rules for the custom segmentation form, keyed by form-item path.
const segmentationConfigRules: FormRules = {
  // Required array value; checked on both blur and change.
  punctuations: [
    {
      type: 'array',
      required: true,
      trigger: ['blur', 'change'],
      message: t('personal_space_module.knowledge_module.upload_document_module.please_select_segment_identifier'),
    },
  ],
  // Required number; checked on blur.
  chunkSize: [
    {
      required: true,
      type: 'number',
      trigger: 'blur',
      message: t('personal_space_module.knowledge_module.upload_document_module.please_input_segment_word_number'),
    },
  ],
  // Required number; checked on blur.
  repetitionRate: [
    {
      required: true,
      type: 'number',
      trigger: 'blur',
      message: t(
        'personal_space_module.knowledge_module.upload_document_module.please_input_segment_overlap_word_proportion',
      ),
    },
  ],
}
// Options for the segment-identifier multi-select.
//
// Every option needs a distinct `value`. Previously the Chinese and English
// exclamation marks shared the value '!', and the Chinese comma / question mark
// carried ASCII characters. The Chinese entries now use the fullwidth code
// points, written as escapes so they survive encoding normalisation.
// 'by_word' is a sentinel handled by handleUpdatePunctuations, not a punctuation mark.
const punctuationOptions = [
  { label: t('personal_space_module.knowledge_module.divide_by_word_count'), value: 'by_word' },
  // U+FF0C FULLWIDTH COMMA
  { label: t('personal_space_module.knowledge_module.Chinese_comma'), value: '\uFF0C' },
  // U+3002 IDEOGRAPHIC FULL STOP
  { label: t('personal_space_module.knowledge_module.Chinese_period'), value: '\u3002' },
  // U+FF1F FULLWIDTH QUESTION MARK
  { label: t('personal_space_module.knowledge_module.Chinese_question_mark'), value: '\uFF1F' },
  // U+FF01 FULLWIDTH EXCLAMATION MARK
  { label: t('personal_space_module.knowledge_module.Chinese_exclamation_mark'), value: '\uFF01' },
  { label: t('personal_space_module.knowledge_module.English_period'), value: '.' },
  { label: t('personal_space_module.knowledge_module.English_exclamation_mark'), value: '!' },
  // NOTE(review): six ASCII dots kept as-is — confirm this is what the backend expects.
  { label: t('personal_space_module.knowledge_module.ellipsis'), value: '......' },
]
// True while the form model's segmentationType is 'DEFAULT'.
const isDefaultSegmentation = computed(() => segmentationConfig.value.segmentationType === 'DEFAULT')
/**
 * Build a fresh config object for 'DEFAULT' segmentation.
 * A new object (with new arrays) is returned on every call, so callers can
 * mutate the result without affecting later calls.
 */
function defaultSegmentationConfig(): SegmentationConfigInterface {
  const initialConfig: SegmentationConfigInterface = {
    segmentationType: 'DEFAULT',
    chunkSize: 300,
    scrapSize: 80,
    repetitionRate: 5,
    relationInfo: [],
    regex: '',
    punctuations: [],
  }
  return initialConfig
}
/**
 * Handle a change of the identifier multi-select.
 *
 * When the most recently added entry is the 'by_word' sentinel, it replaces the
 * whole selection and the mode becomes 'NUMBER_Of_WORDS'. Otherwise 'by_word'
 * is removed from the selection and the mode becomes 'IDENTIFIER'.
 *
 * @param checkedValue the values currently selected in the multi-select
 */
function handleUpdatePunctuations(checkedValue: string[]) {
  const config = segmentationConfig.value
  const lastPicked = checkedValue.at(-1)

  if (lastPicked === 'by_word') {
    config.punctuations = ['by_word']
    config.segmentationType = 'NUMBER_Of_WORDS'
    return
  }

  config.punctuations = checkedValue.filter((picked) => picked !== 'by_word')
  config.segmentationType = 'IDENTIFIER'
}
/**
 * Handle a click on one of the two segmentation cards.
 *
 * - 'DEFAULT': reset the model to defaultSegmentationConfig() and clear any
 *   validation state on the form (if one is mounted).
 * - 'IDENTIFIER': switch the mode, unless the model is already in
 *   'NUMBER_Of_WORDS' mode, which is left untouched.
 *
 * @param segmentationType the mode associated with the clicked card
 */
function handleClickSegmentationType(segmentationType: 'DEFAULT' | 'IDENTIFIER') {
  if (segmentationType === 'DEFAULT') {
    segmentationConfig.value = defaultSegmentationConfig()
    segmentationConfigFormRef.value?.restoreValidation()
    return
  }

  // Clicking the custom card must not override an active word-count selection.
  const keepWordCountMode =
    segmentationType === 'IDENTIFIER' && segmentationConfig.value.segmentationType === 'NUMBER_Of_WORDS'
  if (!keepWordCountMode) {
    segmentationConfig.value.segmentationType = segmentationType
  }
}
// Emit 'prev' to the parent component.
function handlePrev() {
emit('prev')
}
/**
 * Emit 'next' with the current segmentation config.
 *
 * When the custom form is mounted, the config is emitted only after validation
 * reports no errors. When no form is mounted, it is emitted immediately.
 *
 * The previous text also fired a payload-less `emit('next')` before
 * validation, which signalled the parent even when the form was invalid; that
 * call is removed.
 */
function handleNext() {
  const formInst = segmentationConfigFormRef.value

  if (!formInst) {
    emit('next', segmentationConfig.value)
    return
  }

  formInst.validate((errors) => {
    if (!errors) {
      emit('next', segmentationConfig.value)
    }
  })
}
</script>
<template>
<div>
<div>
<div class="flex flex-col gap-[18px]">
<div
class="border-theme-color bg-active-color flex h-[90px] w-full cursor-pointer flex-col justify-between rounded-[10px] border px-6 py-5"
class="hover:border-theme-color flex w-full cursor-pointer flex-col justify-between rounded-[10px] border bg-white px-6 py-5"
:class="isDefaultSegmentation ? 'border-theme-color' : ''"
@click="handleClickSegmentationType('DEFAULT')"
>
<span class="text-font-color">
<span class="text-font-color mb-[7.5px]">
{{ t('personal_space_module.knowledge_module.upload_document_module.default_segment_setting_title') }}
</span>
<span class="text-gray-font-color text-[13px]">
{{ t('personal_space_module.knowledge_module.upload_document_module.default_segment_setting_desc') }}
</span>
</div>
<div
class="hover:border-theme-color flex w-full cursor-pointer flex-col justify-between rounded-[10px] border bg-white px-6 py-5"
:class="!isDefaultSegmentation ? 'border-theme-color' : ''"
@click="handleClickSegmentationType('IDENTIFIER')"
>
<span class="text-font-color mb-[7.5px]">{{ t('common_module.custom') }}</span>
<span class="text-gray-font-color text-[13px]">
{{ t('personal_space_module.knowledge_module.upload_document_module.custom_segment_setting_desc') }}
</span>
<div v-show="!isDefaultSegmentation" class="border-inactive-border-color mb-[18px] mt-[7px] border-b" />
<div v-if="!isDefaultSegmentation">
<n-form ref="segmentationConfigFormRef" :model="segmentationConfig" :rules="segmentationConfigRules">
<n-form-item path="punctuations">
<template #label>
<div class="flex items-center gap-[5px]">
<span class="text-font-color">
{{ t('personal_space_module.knowledge_module.upload_document_module.segment_identifier') }}
</span>
<n-popover trigger="hover" class="max-w-[300px]">
<template #trigger>
<Help theme="outline" size="15" fill="#333" :stroke-width="3" class="mt-[2px] cursor-pointer" />
</template>
<span>
{{ t('personal_space_module.knowledge_module.upload_document_module.segment_identifier_tip') }}
</span>
</n-popover>
</div>
</template>
<n-select
v-model:value="segmentationConfig.punctuations"
:options="punctuationOptions"
multiple
max-tag-count="responsive"
:placeholder="
t('personal_space_module.knowledge_module.upload_document_module.please_select_segment_identifier')
"
@update:value="handleUpdatePunctuations"
/>
</n-form-item>
<n-form-item path="chunkSize">
<template #label>
<div class="flex items-center gap-[5px]">
<span class="text-font-color">
{{
t('personal_space_module.knowledge_module.upload_document_module.segment_maximum_number_of_words')
}}
</span>
<n-popover trigger="hover" class="max-w-[300px]">
<template #trigger>
<Help theme="outline" size="15" fill="#333" :stroke-width="3" class="mt-[2px] cursor-pointer" />
</template>
<span>
{{
t(
'personal_space_module.knowledge_module.upload_document_module.segment_maximum_number_of_words_tip',
)
}}
</span>
</n-popover>
</div>
</template>
<n-input-number
v-model:value="segmentationConfig.chunkSize"
:min="0"
:max="1000"
:show-button="false"
:placeholder="
t('personal_space_module.knowledge_module.upload_document_module.please_input_segment_word_number')
"
class="w-full"
/>
</n-form-item>
<n-form-item path="repetitionRate">
<template #label>
<div class="flex items-center gap-[5px]">
<span class="text-font-color">
{{
t('personal_space_module.knowledge_module.upload_document_module.segment_overlap_word_proportion')
}}
</span>
<n-popover trigger="hover" class="max-w-[300px]">
<template #trigger>
<Help theme="outline" size="15" fill="#333" :stroke-width="3" class="mt-[2px] cursor-pointer" />
</template>
<span>
{{
t(
'personal_space_module.knowledge_module.upload_document_module.segment_overlap_word_proportion_tip',
)
}}
</span>
</n-popover>
</div>
</template>
<n-input-number
v-model:value="segmentationConfig.repetitionRate"
:min="0"
:max="25"
:show-button="false"
:placeholder="
t(
'personal_space_module.knowledge_module.upload_document_module.please_input_segment_overlap_word_proportion',
)
"
class="w-full"
>
<template #suffix>
<span class="mr-[-8px] rounded-r-[5px] border px-3.5">%</span>
</template>
</n-input-number>
</n-form-item>
</n-form>
</div>
</div>
</div>
<div class="mt-14 flex justify-end">
......
......@@ -470,6 +470,7 @@ declare namespace I18n {
segment: string
auto_segment: string
custom_segment: string
add_chunk_up_message: string
add_chunk_down_message: string
search_knowledge_chunk_placeholder: string
......@@ -478,6 +479,15 @@ declare namespace I18n {
knowledge_chunk_content_input_rule: string
delete_knowledge_chunk_content_message: string
divide_by_word_count: string
Chinese_comma: string
Chinese_period: string
Chinese_question_mark: string
Chinese_exclamation_mark: string
English_period: string
English_exclamation_mark: string
ellipsis: string
upload_document_module: {
segment_setting: string
data_process: string
......@@ -492,6 +502,16 @@ declare namespace I18n {
upload_error_message: string
default_segment_setting_title: string
default_segment_setting_desc: string
custom_segment_setting_desc: string
segment_identifier: string
segment_identifier_tip: string
please_select_segment_identifier: string
segment_maximum_number_of_words: string
segment_maximum_number_of_words_tip: string
please_input_segment_word_number: string
segment_overlap_word_proportion: string
segment_overlap_word_proportion_tip: string
please_input_segment_overlap_word_proportion: string
data_process_tip_message: string
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment