article.go 8.5 KB
Newer Older
1 2 3
package service

import (
aaronchen2k2k's avatar
aaronchen2k2k 已提交
4
	"fmt"
5 6 7
	"github.com/easysoft/zendata/src/model"
	constant "github.com/easysoft/zendata/src/utils/const"
	fileUtils "github.com/easysoft/zendata/src/utils/file"
aaronchen2k2k's avatar
aaronchen2k2k 已提交
8
	stringUtils "github.com/easysoft/zendata/src/utils/string"
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	_ "github.com/mattn/go-sqlite3"
	"gopkg.in/yaml.v3"
	"path"
	"path/filepath"
	"strconv"
	"strings"
)

const (
	// strLeft and strRight are the quote characters that open and close a
	// literal string section while parsing an article.
	strLeft = "“"
	strRight = "”"

	// expLeft and expRight open and close an expression section; text between
	// them is handed to createFields as a select expression.
	expLeft = "("
	expRight = ")"

	// table is passed to createDef and stored as the From value of every
	// generated sentence definition.
	table = "words.v1"
)
var (
	// compares lists comparison operators. It is not referenced in the visible
	// part of this file. NOTE(review): confirm whether it is still needed.
	compares = []string{"=", "!=", ">", "<"}
)

func ConvertArticle(src, dist string) {
	files := make([]string, 0)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
32
	if !fileUtils.IsDir(src) { //  file
33 34 35
		pth, _ := filepath.Abs(src)
		files = append(files, pth)

aaronchen2k2k's avatar
aaronchen2k2k 已提交
36
		if dist == "" { dist = fileUtils.AddSepIfNeeded(path.Dir(pth)) }
37 38
	} else {
		fileUtils.GetFilesInDir(src, ".txt", &files)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
39
		if dist == "" { dist = fileUtils.AddSepIfNeeded(src) }
40 41 42
	}

	for _, filePath := range files {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
43
		yamlPaths := convertSentYaml(filePath, dist)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
44
		convertMainYaml(yamlPaths, filePath, dist)
45 46 47
	}
}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
48 49
func convertSentYaml(filePath, dist string) (yamlPaths []string) {
	article := fileUtils.ReadFile(filePath)
50
	sections := parseSections(article)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
51
	paragraphs := groupSections(sections)
52

aaronchen2k2k's avatar
aaronchen2k2k 已提交
53
	for paragIndex, parag := range paragraphs {
54

aaronchen2k2k's avatar
aaronchen2k2k 已提交
55 56
		for sentIndex, sent := range parag {
			fileSeq := fmt.Sprintf("p%02d-s%02d", paragIndex + 1, sentIndex + 1)
57

aaronchen2k2k's avatar
aaronchen2k2k 已提交
58
			conf := createDef(constant.ConfigTypeArticle, table, fileUtils.GetRelatPath(filePath))
59

aaronchen2k2k's avatar
aaronchen2k2k 已提交
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
			prefix := ""
			for sectIndex, sect := range sent { // each sent saved as a yaml file
				fieldSeq := fmt.Sprintf("%d-%d-%d", paragIndex + 1, sentIndex + 1, sectIndex + 1)
				if sect.Type == "exp" {
					fields := createFields(fieldSeq, prefix, sect.Val)
					conf.XFields = append(conf.XFields, fields...)

					prefix = ""
				} else {
					prefix += sect.Val

					if prefix != "" && sectIndex == len(sent) - 1 { // last section
						field := model.DefFieldExport{Field: fieldSeq, Prefix: prefix}
						conf.XFields = append(conf.XFields, field)
						prefix = ""
					}
				}
aaronchen2k2k's avatar
aaronchen2k2k 已提交
77
			}
aaronchen2k2k's avatar
aaronchen2k2k 已提交
78 79

			bytes, _ := yaml.Marshal(&conf)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
80
			content := stringUtils.ConvertYamlStringToMapFormat(bytes)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
81 82 83 84 85 86
			content = strings.Replace(content, "xfields", "\nfields", -1)

			yamlPath := fileUtils.AddSepIfNeeded(dist) +
				fileUtils.ChangeFileExt(path.Base(filePath), "-") + fileSeq + ".yaml"
			fileUtils.WriteFile(yamlPath, content)

aaronchen2k2k's avatar
aaronchen2k2k 已提交
87
			yamlPaths = append(yamlPaths, yamlPath)
88 89 90
		}
	}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
91 92 93 94 95 96 97
	return
}

func convertMainYaml(yamlPaths []string, filePath, dist string) {
	conf := createArticle(constant.ConfigTypeArticle, fileUtils.GetRelatPath(filePath))

	for index, file := range yamlPaths {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
98 99
		path := strings.TrimPrefix(file, dist)
		field := model.ArticleField{Field: strconv.Itoa(index + 1), Range: path}
aaronchen2k2k's avatar
aaronchen2k2k 已提交
100 101 102
		conf.XFields = append(conf.XFields, field)
	}

103
	bytes, _ := yaml.Marshal(&conf)
aaronchen2k2k's avatar
aaronchen2k2k 已提交
104
	content := string(bytes)
105 106 107 108 109 110 111 112

	// convert yaml format by using a map
	m := make(map[string]interface{})
	yaml.Unmarshal([]byte(content), &m)
	bytes, _ = yaml.Marshal(&m)
	content = string(bytes)
	content = strings.Replace(content, "xfields", "\nfields", -1)

aaronchen2k2k's avatar
aaronchen2k2k 已提交
113 114 115 116 117
	yamlPath := fileUtils.AddSepIfNeeded(dist) + fileUtils.ChangeFileExt(path.Base(filePath), ".yaml")
	fileUtils.WriteFile(yamlPath, content)

	relatPath := fileUtils.GetRelatPath(yamlPath)
	yamlPaths = append(yamlPaths, relatPath)
118 119 120
}

func createDef(typ, table, filePath string) (conf model.DefExport) {
121
	conf.Title = "文章句子"
122
	conf.Author = "ZenData"
aaronchen2k2k's avatar
aaronchen2k2k 已提交
123
	conf.Type = typ
124
	conf.Desc = "Generated from " + filePath
aaronchen2k2k's avatar
aaronchen2k2k 已提交
125 126 127 128 129 130 131 132 133

	if table != "" {
		conf.From = table
	}

	return
}

func createArticle(typ, filePath string) (conf model.Article) {
134
	conf.Title = "文章主文件"
aaronchen2k2k's avatar
aaronchen2k2k 已提交
135
	conf.Author = "ZenData"
136
	conf.Type = typ
137
	conf.Desc = "Generated from " + filePath
138 139 140 141

	return
}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
142
func createFields(seq string, prefix, exp string) (fields []model.DefFieldExport) {
143
	field := model.DefFieldExport{}
aaronchen2k2k's avatar
aaronchen2k2k 已提交
144
	field.Field = seq
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
	field.Prefix = prefix
	field.Rand = true
	field.Limit = 1

	// deal with exp like S:名词-姓+名词-名字=F
	exp = strings.ToLower(strings.TrimSpace(exp))
	expArr := []rune(exp)

	if string(expArr[0]) == "s" && (string(expArr[1]) == ":" || string(expArr[1]) == ":") {
		exp = string(expArr[2:])
		expArr = expArr[2:]
	}

	if strings.Index(exp, "=") == len(exp) - 2 {
		exp = string(expArr[:len(expArr) - 2])
aaronchen2k2k's avatar
aaronchen2k2k 已提交
160 161
		field.Select = exp
		field.Where = string(expArr[len(expArr) - 1])
162
	} else {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
163 164
		field.Select = exp
		field.Where = ""
165 166 167 168 169 170 171 172 173 174
	}

	if strings.Index(field.Select, "+") < 0 {
		fields = append(fields, field)
	} else if strings.Index(field.Select, "+") > 0 { // include more than one field, split to two
		items := strings.Split(field.Select, "+")
		for _, item := range items {
			var objClone interface{} = field
			fieldClone := objClone.(model.DefFieldExport)
			fieldClone.Select = item
aaronchen2k2k's avatar
aaronchen2k2k 已提交
175
			fieldClone.Where = field.Where
176 177 178 179 180 181 182 183

			fields = append(fields, fieldClone)
		}
	}

	return
}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
184
func parseSections(content string) (sections []model.ArticleSent) {
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
	strStart := false
	expStart := false

	content = strings.TrimSpace(content)
	runeArr := []rune(content)

	section := ""
	for i := 0; i < len(runeArr); i++ {
		item := runeArr[i]
		str := string(item)

		isCouple, duplicateStr := isCouple(i, runeArr)
		if isCouple {
			section += duplicateStr
			i += 1
aaronchen2k2k's avatar
aaronchen2k2k 已提交
200 201 202 203 204

			if i == len(runeArr) - 1 {
				addSection(section, "str", &sections)
			}

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
		} else if strStart && str == strRight { // str close
			addSection(section, "str", &sections)

			strStart = false
			section = ""
		} else if expStart && str == expRight { // exp close
			addSection(section, "exp", &sections)

			expStart = false
			section = ""
		} else if !strStart && !expStart && str == strLeft { // str start
			if section != "" && strings.TrimSpace(section) != "+" {
				addSection(section, "str", &sections)
			}

			strStart = true
			section = ""
		} else if !strStart && !expStart && str == expLeft { // exp start
			if section != "" && strings.TrimSpace(section) != "+" {
				addSection(section, "str", &sections)
			}

			expStart = true
			section = ""
		} else {
			section += str
aaronchen2k2k's avatar
aaronchen2k2k 已提交
231

aaronchen2k2k's avatar
aaronchen2k2k 已提交
232
			if str == "。" {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
233
				if i < len(runeArr) - 1 && string(runeArr[i+1]) == strRight {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
234
					i += 1
aaronchen2k2k's avatar
aaronchen2k2k 已提交
235
					strStart = false
aaronchen2k2k's avatar
aaronchen2k2k 已提交
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
				}

				addSection(section, "str", &sections)

				expStart = false
				section = ""
			} else if str == "\n" {
				// get all \n
				for j := i+1; j < len(runeArr); j++ {
					if string(runeArr[j]) == "\n" {
						section += str
						i = j
					} else {
						break
					}
				}

				addSection(section, "str", &sections)

				strStart = false
				expStart = false
				section = ""
			} else if i == len(runeArr) - 1 {
				addSection(section, "str", &sections)
			}
		}
262 263 264 265 266
	}

	return
}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
267 268 269
func groupSections(sectionArr []model.ArticleSent) (paragraphs [][][]model.ArticleSent) {
	sections := make([]model.ArticleSent, 0)
	sentences := make([][]model.ArticleSent, 0)
270

aaronchen2k2k's avatar
aaronchen2k2k 已提交
271 272 273
	for index := 0; index < len(sectionArr); index++ {
		section := sectionArr[index]
		sections = append(sections, section)
274

aaronchen2k2k's avatar
aaronchen2k2k 已提交
275
		if section.IsParag || index == len(sectionArr) - 1 {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
276 277
			sentences = append(sentences, sections)
			paragraphs = append(paragraphs, sentences)
278

aaronchen2k2k's avatar
aaronchen2k2k 已提交
279 280 281 282 283 284 285
			sentences = make([][]model.ArticleSent, 0)
			sections = make([]model.ArticleSent, 0)
		} else if section.IsSent {
			if index < len(sectionArr) - 1 && sectionArr[index+1].IsParag {
				sections = append(sections, sectionArr[index+1])
				sentences = append(sentences, sections)
				paragraphs = append(paragraphs, sentences)
286

aaronchen2k2k's avatar
aaronchen2k2k 已提交
287 288
				sections = make([]model.ArticleSent, 0)
				sentences = make([][]model.ArticleSent, 0)
289 290 291

				index += 1
			} else {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
292 293 294
				sentences = append(sentences, sections)
				if index == len(sectionArr) - 1 {
					paragraphs = append(paragraphs, sentences)
295 296
				}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
297
				sections = make([]model.ArticleSent, 0)
298 299 300 301 302 303 304 305
			}

		}
	}

	return
}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
306 307 308 309
func addSection(str, typ string, arr *[]model.ArticleSent) {
	sent := model.ArticleSent{}
	sent.Type = typ
	sent.Val = str
310

aaronchen2k2k's avatar
aaronchen2k2k 已提交
311 312 313
	runeArr := []rune(str)
	end := runeArr[len(runeArr) - 1]
	if string(end) == "\n" {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
314
		sent.IsParag = true
aaronchen2k2k's avatar
aaronchen2k2k 已提交
315
	} else if string(end) == "。" {
aaronchen2k2k's avatar
aaronchen2k2k 已提交
316
		sent.IsSent = true
aaronchen2k2k's avatar
aaronchen2k2k 已提交
317 318
	}

aaronchen2k2k's avatar
aaronchen2k2k 已提交
319
	*arr = append(*arr, sent)
320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
}

// isCouple reports whether arr[i] is one of the four section delimiters
// (strLeft, strRight, expLeft, expRight) and is immediately followed by the
// same character. When it is, duplicateStr holds that delimiter as a string;
// otherwise it is empty.
func isCouple(i int, arr []rune) (isCouple bool, duplicateStr string) {
	cur := string(arr[i])

	// A pair needs a following rune identical to the current one.
	if i+1 >= len(arr) || string(arr[i+1]) != cur {
		return false, ""
	}

	switch cur {
	case strLeft, strRight, expLeft, expRight:
		return true, cur
	}

	return false, ""
}