提交 cd3eedc5,作者:ToTensor

extract success

上级 1ac1d443
...@@ -3,10 +3,13 @@ from src.ebook.community import send_topic ...@@ -3,10 +3,13 @@ from src.ebook.community import send_topic
if __name__ == "__main__": if __name__ == "__main__":
book_mapping = { book_mapping = {
"前端体验设计": "c4eeb42b07f54b42a9fd1568b8ec4b98", "前端体验设计——HTML5+CSS3终极修炼": "c4eeb42b07f54b42a9fd1568b8ec4b98",
} }
extract_code(book_mapping) for key in book_mapping.keys():
web_url = 'https://gitcode.net/csdn/content/book_id_c4eeb42b07f54b42a9fd1568b8ec4b98/-/tree/master/' extract_code(book_mapping)
print('-------' * 20) web_url = 'https://gitcode.net/csdn/content/book_id_{}/-/tree/master/'.format(
print('开始向社区发帖') book_mapping[key])
# send_topic(web_url) print('-------' * 20)
\ No newline at end of file print('开始向社区发帖')
book_dir = 'data/{}/'.format(key)
# send_topic(web_url, book_dir)
\ No newline at end of file
...@@ -65,9 +65,8 @@ def post(url, params, retry=3, headers=None): ...@@ -65,9 +65,8 @@ def post(url, params, retry=3, headers=None):
raise error raise error
def send_topic(web_url): def send_topic(web_url, book_dir):
data_dir = 'data' data_dir = 'data'
book_dir = 'data/深入剖析Nginx/'
# web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/" # web_url = "https://codechina.csdn.net/csdn/book_code_c798a5992a654857867ec15660e1c32a/-/blob/master/"
request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic' request_url = 'http://ccloud.internal.csdn.net/v1/internal/community/content/sendTopic'
......
...@@ -57,103 +57,102 @@ def extract_code(book_mapping): ...@@ -57,103 +57,102 @@ def extract_code(book_mapping):
chapter_content = html.unescape(chapter_content) chapter_content = html.unescape(chapter_content)
# print(chapter_content) # print(chapter_content)
if book_name == "前端体验设计": section_list = re.findall(r'<h2.*?>(.*?)</h2>',
section_list = re.findall(r'<h2.*?>(.*?)</h2>', chapter_content,
chapter_content, flags=re.S)
flags=re.S)
print(section_list) print(section_list)
section_content_list = re.split(r'<h2.*?>.*?</h2>', section_content_list = re.split(r'<h2.*?>.*?</h2>',
chapter_content, chapter_content,
flags=re.S) flags=re.S)
section_dir_list = [] section_dir_list = []
for idx, section in enumerate(section_list): for idx, section in enumerate(section_list):
section = section.replace(' ', ' ') section = section.replace(' ', ' ')
if section.find(r'/') != -1: if section.find(r'/') != -1:
section = section.replace('/', '') section = section.replace('/', '')
section_dir = os.path.join( section_dir = os.path.join(chapter_dir,
chapter_dir, '{}.{}'.format(idx + 1, section)) '{}.{}'.format(idx + 1, section))
print(section_dir) print(section_dir)
if not os.path.exists(section_dir): if not os.path.exists(section_dir):
os.mkdir(section_dir) os.mkdir(section_dir)
section_dir_list.append(section_dir) section_dir_list.append(section_dir)
for idx, section_content in enumerate(section_content_list): for idx, section_content in enumerate(section_content_list):
if idx == 0: if idx == 0:
html_save_path = os.path.join(chapter_dir, 'text.html') html_save_path = os.path.join(chapter_dir, 'text.html')
else: else:
html_save_path = os.path.join( html_save_path = os.path.join(section_dir_list[idx - 1],
section_dir_list[idx - 1], 'text.html') 'text.html')
# with open(html_save_path, 'w', encoding='utf-8') as f: # with open(html_save_path, 'w', encoding='utf-8') as f:
# f.write(section_content) # f.write(section_content)
code_list = re.findall(r'<code>(.*?)</code>', code_list = re.findall(r'<code>(.*?)</code>', section_content,
section_content, re.S) re.S)
res_codelist = [] res_codelist = []
for code in code_list: for code in code_list:
code = code.strip() code = code.strip()
if code != '': if code != '':
res_codelist.append(code) res_codelist.append(code)
# print(res_codelist) # print(res_codelist)
# break # break
count = 0 count = 0
for code in res_codelist: for code in res_codelist:
if len(code.split('\n')) < 2: if len(code.split('\n')) < 2:
continue continue
# code = html.unescape(code) # code = html.unescape(code)
# soup = BeautifulSoup(code) # soup = BeautifulSoup(code)
# clean_code = soup.get_text() # clean_code = soup.get_text()
# print(clean_code) # print(clean_code)
# print('-------' * 10) # print('-------' * 10)
# pianduan_name = re.findall(r'(代码片段.*),', clean_code) # pianduan_name = re.findall(r'(代码片段.*),', clean_code)
# if pianduan_name == []: # if pianduan_name == []:
# pianduan_name_str = '' # pianduan_name_str = ''
# else: # else:
# pianduan_name_str = pianduan_name[0] # pianduan_name_str = pianduan_name[0]
# file_name_list = re.findall(r'文件名: (.*)\n', clean_code) # file_name_list = re.findall(r'文件名: (.*)\n', clean_code)
# print(file_name_list) # print(file_name_list)
# if file_name_list == []: # if file_name_list == []:
# file_name = '.txt' # file_name = '.txt'
# else: # else:
# file_name = file_name_list[0] # file_name = file_name_list[0]
# file_name = file_name.replace('/', '-') # file_name = file_name.replace('/', '-')
# save_file_name = pianduan_name_str + '-' + file_name # save_file_name = pianduan_name_str + '-' + file_name
# print(save_file_name) # print(save_file_name)
if idx == 0: if idx == 0:
code_save_path = os.path.join( code_save_path = os.path.join(chapter_dir,
chapter_dir, 'code_0.css') 'code_0.css')
else: else:
count += 1 count += 1
code_save_path = os.path.join( code_save_path = os.path.join(
section_dir_list[idx - 1], section_dir_list[idx - 1],
'code_{}.css'.format(count)) 'code_{}.css'.format(count))
# res_code_list = [] # res_code_list = []
# for line in clean_code.split('\n'): # for line in clean_code.split('\n'):
# if line.find('文件名') != -1 or line.find( # if line.find('文件名') != -1 or line.find(
# '代码片段') != -1 or line == '': # '代码片段') != -1 or line == '':
# continue # continue
# clean_line = re.findall(r'^\d{1,5}\: *(.*)', # clean_line = re.findall(r'^\d{1,5}\: *(.*)',
# line)[0] # line)[0]
# res_code_list.append(clean_line) # res_code_list.append(clean_line)
# res_code = '\n'.join(res_code_list) # res_code = '\n'.join(res_code_list)
with open(code_save_path, 'w', encoding='utf-8') as f: with open(code_save_path, 'w', encoding='utf-8') as f:
f.write(code) f.write(code)
# clean_text_list = [] # clean_text_list = []
# for line in res_str.split('\n'): # for line in res_str.split('\n'):
# if line == '': # if line == '':
# continue # continue
# if line[0].isdigit(): # if line[0].isdigit():
# line = re.findall(r'^[0-9]+ {0,2}(.*)', # line = re.findall(r'^[0-9]+ {0,2}(.*)',
# line)[0] # line)[0]
# # print(line) # # print(line)
# else: # else:
# if line.startswith('>>'): # if line.startswith('>>'):
# break # break
# clean_text_list.append(line) # clean_text_list.append(line)
# clean_code = '\n'.join(clean_text_list) # clean_code = '\n'.join(clean_text_list)
# print(clean_code) # print(clean_code)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册