#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import json
import math
import random

import cv2
import numpy as np


class EASTProcessTrain(object):
    """
    Build EAST training samples (image, score map, geometry map, training
    mask) from one annotation line.
    """

    def __init__(self, params):
        """
        :param params: mapping that provides 'img_set_dir',
            'background_ratio', 'min_crop_side_ratio', 'image_shape'
            and 'min_text_size'
        """
        self.img_set_dir = params['img_set_dir']
        # candidate factors for the random rescale done in __call__
        self.random_scale = np.array([0.5, 1, 2.0, 3.0])
        self.background_ratio = params['background_ratio']
        self.min_crop_side_ratio = params['min_crop_side_ratio']
        image_shape = params['image_shape']
        # element 1 of image_shape is used as the side of the square input
        self.input_size = image_shape[1]
        self.min_text_size = params['min_text_size']

    def preprocess(self, im):
        """
        Resize, normalize and zero-pad an image to a square network input.

        The longer side is scaled to ``self.input_size``; the result is
        placed in the top-left corner of a zero canvas.

        :param im: H x W x 3 image array
            (presumably BGR as returned by cv2.imread - confirm)
        :return: (padded image of shape (1, 3, input_size, input_size),
            scale factor that was applied to the image)
        """
        input_size = self.input_size
        im_size_max = np.max(im.shape[0:2])
        im_scale = float(input_size) / float(im_size_max)
        im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale)
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        # reverse the channel order before applying per-channel statistics
        im = im[:, :, ::-1].astype(np.float32)
        im = im / 255
        im -= img_mean
        im /= img_std
        new_h, new_w, _ = im.shape
        im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32)
        im_padded[:new_h, :new_w, :] = im
        # HWC -> CHW, then add a leading batch axis
        im_padded = im_padded.transpose((2, 0, 1))
        im_padded = im_padded[np.newaxis, :]
        return im_padded, im_scale

    def convert_label_infor(self, label_infor):
        """
        Parse one annotation line.

        The line is "<relative image path>\\t<json list>", where every json
        item has a 'points' and a 'transcription' entry.

        :param label_infor: the raw annotation line as bytes
        :return: (image path, float32 array of boxes, bool array that is
            True for boxes transcribed as '###', list of transcriptions)
        """
        label_infor = label_infor.decode()
        # round-trip through utf-8-sig to drop a leading BOM, if any
        label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
        substr = label_infor.strip("\n").split("\t")
        img_path = self.img_set_dir + substr[0]
        label = json.loads(substr[1])
        wordBBs, txts, txt_tags = [], [], []
        for item in label:
            txt = item['transcription']
            wordBBs.append(item['points'])
            txts.append(txt)
            txt_tags.append(txt == '###')
        wordBBs = np.array(wordBBs, dtype=np.float32)
        # the builtin bool is used because the np.bool alias was removed
        # in NumPy 1.24
        txt_tags = np.array(txt_tags, dtype=bool)
        return img_path, wordBBs, txt_tags, txts

    def rotate_im_poly(self, im, text_polys):
        """
        rotate image with 90 / 180 / 270 degre

        The number of quarter turns is drawn at random and the polygon
        vertices are rotated accordingly.

        :param im: image array (rows, cols, ...)
        :param text_polys: array of quadrangles, indexed [poly][vertex][x/y]
        :return: (rotated image, float32 array of rotated polygons)
        """
        im_w, im_h = im.shape[1], im.shape[0]
        dst_im = im.copy()
        rand_degree_ratio = np.random.rand()
        rand_degree_cnt = 1
        if 0.333 < rand_degree_ratio < 0.666:
            rand_degree_cnt = 2
        elif rand_degree_ratio > 0.666:
            rand_degree_cnt = 3
        for _ in range(rand_degree_cnt):
            dst_im = np.rot90(dst_im)
        rot_degree = -90 * rand_degree_cnt
        rot_angle = rot_degree * math.pi / 180.0
        # loop-invariant rotation coefficients
        cos_a = math.cos(rot_angle)
        sin_a = math.sin(rot_angle)
        # rotate around the old image centre, then shift to the new centre
        cx, cy = 0.5 * im_w, 0.5 * im_h
        ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]
        dst_polys = []
        for i in range(text_polys.shape[0]):
            wordBB = text_polys[i]
            poly = []
            for j in range(4):
                sx, sy = wordBB[j][0], wordBB[j][1]
                dx = cos_a * (sx - cx) - sin_a * (sy - cy) + ncx
                dy = sin_a * (sx - cx) + cos_a * (sy - cy) + ncy
                poly.append([dx, dy])
            dst_polys.append(poly)
        dst_polys = np.array(dst_polys, dtype=np.float32)
        return dst_im, dst_polys

    def polygon_area(self, poly):
        """
        compute the signed area of a quadrangle

        :param poly: four (x, y) vertices
        :return: signed area; the sign depends on the vertex order
        """
        edge = [(poly[1][0] - poly[0][0]) * (poly[1][1] + poly[0][1]),
                (poly[2][0] - poly[1][0]) * (poly[2][1] + poly[1][1]),
                (poly[3][0] - poly[2][0]) * (poly[3][1] + poly[2][1]),
                (poly[0][0] - poly[3][0]) * (poly[0][1] + poly[3][1])]
        return np.sum(edge) / 2.

    def check_and_validate_polys(self, polys, tags, img_height, img_width):
        """
        check so that the text poly is in the same direction,
        and also filter some invalid polygons

        ``polys`` is clipped to the image in place.

        :param polys: array of quadrangles, indexed [poly][vertex][x/y]
        :param tags: per-polygon ignore flags
        :param img_height: image height used for clipping
        :param img_width: image width used for clipping
        :return: (validated polys, validated tags)
        """
        h, w = img_height, img_width
        if polys.shape[0] == 0:
            # return a pair here as well, so callers can always unpack
            # two values
            return polys, tags
        polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
        polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)

        validated_polys = []
        validated_tags = []
        for poly, tag in zip(polys, tags):
            p_area = self.polygon_area(poly)
            # invalid poly
            if abs(p_area) < 1:
                continue
            if p_area > 0:
                # poly in wrong direction: reversed cases should be ignored,
                # so the tag is forced to True and the vertex order flipped
                tag = True
                poly = poly[(0, 3, 2, 1), :]
            validated_polys.append(poly)
            validated_tags.append(tag)
        return np.array(validated_polys), np.array(validated_tags)

    def draw_img_polys(self, img, polys):
        """
        Debug helper: draw polygons on an image and write it to disk.

        Writes "tmp.jpg" and "tmp_<n>.jpg" (n random in [0, 100]) into the
        current working directory.

        NOTE(review): the per-channel offsets added below do not match the
        normalization done in preprocess - confirm which input this helper
        is meant for. The offsets are added in place, so a caller's array
        may be modified.

        :param img: image array, optionally with a leading axis of size 1
            and/or channels first
        :param polys: iterable of polygon arrays
        """
        if len(img.shape) == 4:
            img = np.squeeze(img, axis=0)
        if img.shape[0] == 3:
            img = img.transpose((1, 2, 0))
            img[:, :, 2] += 123.68
            img[:, :, 1] += 116.78
            img[:, :, 0] += 103.94
        # round-trip through a file to obtain an image cv2 can draw on
        cv2.imwrite("tmp.jpg", img)
        img = cv2.imread("tmp.jpg")
        for box in polys:
            box = box.astype(np.int32).reshape((-1, 1, 2))
            cv2.polylines(img, [box], True, color=(255, 255, 0), thickness=2)
        ino = random.randint(0, 100)
        cv2.imwrite("tmp_%d.jpg" % ino, img)

    @staticmethod
    def _shrink_edge(poly, r, start, end, ratio, swapped):
        """
        Move both endpoints of one edge towards each other, in place.

        :param poly: vertex array, modified in place
        :param r: per-vertex reference lengths
        :param start: index of the vertex moved along the edge direction
        :param end: index of the vertex moved against the edge direction
        :param ratio: fraction of r[i] by which vertex i is moved
        :param swapped: if True the angle is arctan2(dx, dy) and sin/cos
            swap roles; otherwise it is arctan2(dy, dx)
        """
        # both deltas are read before any coordinate is modified
        dx = poly[end][0] - poly[start][0]
        dy = poly[end][1] - poly[start][1]
        if swapped:
            theta = np.arctan2(dx, dy)
            step_x, step_y = np.sin(theta), np.cos(theta)
        else:
            theta = np.arctan2(dy, dx)
            step_x, step_y = np.cos(theta), np.sin(theta)
        poly[start][0] += ratio * r[start] * step_x
        poly[start][1] += ratio * r[start] * step_y
        poly[end][0] -= ratio * r[end] * step_x
        poly[end][1] -= ratio * r[end] * step_y

    def shrink_poly(self, poly, r):
        """
        fit a poly inside the origin poly, maybe bugs here...
        used for generate the score map

        ``poly`` is modified in place and also returned.

        :param poly: the text poly
        :param r: r in the paper
        :return: the shrinked poly
        """
        # shrink ratio
        R = 0.3
        # find the longer pair
        dist0 = np.linalg.norm(poly[0] - poly[1])
        dist1 = np.linalg.norm(poly[2] - poly[3])
        dist2 = np.linalg.norm(poly[0] - poly[3])
        dist3 = np.linalg.norm(poly[1] - poly[2])
        # each step reads the vertices left by the previous one, so the
        # order of the four calls matters
        if dist0 + dist1 > dist2 + dist3:
            # first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2)
            self._shrink_edge(poly, r, 0, 1, R, False)  # p0, p1
            self._shrink_edge(poly, r, 3, 2, R, False)  # p2, p3
            self._shrink_edge(poly, r, 0, 3, R, True)  # p0, p3
            self._shrink_edge(poly, r, 1, 2, R, True)  # p1, p2
        else:
            # first move (p0, p3), (p1, p2), then (p0, p1), (p2, p3)
            self._shrink_edge(poly, r, 0, 3, R, True)  # p0, p3
            self._shrink_edge(poly, r, 1, 2, R, True)  # p1, p2
            self._shrink_edge(poly, r, 0, 1, R, False)  # p0, p1
            self._shrink_edge(poly, r, 3, 2, R, False)  # p2, p3
        return poly

    def generate_quad(self, im_size, polys, tags):
        """
        Generate quadrangle.

        For every polygon the shrunk region is marked in the score map and
        the per-pixel offsets to the four vertices are written to the
        geometry map. ``polys`` is clipped to the image in place.

        :param im_size: (height, width) of the maps to create
        :param polys: array of quadrangles, indexed [poly][vertex][x/y]
        :param tags: per-polygon ignore flags
        :return: (score_map uint8 (h, w), geo_map float32 (h, w, 9),
            training_mask uint8 (h, w))
        """
        h, w = im_size
        # scratch mask holding the shrunk region of the current polygon.
        # It is cleared and filled with 1 for every polygon rather than
        # labelled with poly_idx + 1, which a uint8 mask cannot represent
        # beyond 255.
        poly_mask = np.zeros((h, w), dtype=np.uint8)
        score_map = np.zeros((h, w), dtype=np.uint8)
        # (x1, y1, ..., x4, y4, short_edge_norm)
        geo_map = np.zeros((h, w, 9), dtype=np.float32)
        # mask used during traning, to ignore some hard areas
        training_mask = np.ones((h, w), dtype=np.uint8)
        for poly, tag in zip(polys, tags):
            # r[i]: length of the shorter of the two edges meeting at vertex i
            r = [None, None, None, None]
            for i in range(4):
                dist1 = np.linalg.norm(poly[i] - poly[(i + 1) % 4])
                dist2 = np.linalg.norm(poly[i] - poly[(i - 1) % 4])
                r[i] = min(dist1, dist2)
            # score map
            shrinked_poly = self.shrink_poly(
                poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
            cv2.fillPoly(score_map, shrinked_poly, 1)
            poly_mask[...] = 0
            cv2.fillPoly(poly_mask, shrinked_poly, 1)
            # if the poly is too small, then ignore it during training
            poly_h = min(
                np.linalg.norm(poly[0] - poly[3]),
                np.linalg.norm(poly[1] - poly[2]))
            poly_w = min(
                np.linalg.norm(poly[0] - poly[1]),
                np.linalg.norm(poly[2] - poly[3]))
            # tagged polys are masked out as well
            if min(poly_h, poly_w) < self.min_text_size or tag:
                cv2.fillPoly(training_mask,
                             poly.astype(np.int32)[np.newaxis, :, :], 0)

            xy_in_poly = np.argwhere(poly_mask == 1)
            # geo map.
            y_in_poly = xy_in_poly[:, 0]
            x_in_poly = xy_in_poly[:, 1]
            poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w)
            poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h)
            for pno in range(4):
                geo_channel_beg = pno * 2
                geo_map[y_in_poly, x_in_poly, geo_channel_beg] = \
                    x_in_poly - poly[pno, 0]
                geo_map[y_in_poly, x_in_poly, geo_channel_beg + 1] = \
                    y_in_poly - poly[pno, 1]
            geo_map[y_in_poly, x_in_poly, 8] = \
                1.0 / max(min(poly_h, poly_w), 1.0)
        return score_map, geo_map, training_mask

    def crop_area(self, im, polys, tags, txts, crop_background=False,
                  max_tries=50):
        """
        make random crop from the input image

        Crop borders are only drawn from rows/columns not covered by any
        polygon extent. If no valid crop is found the inputs are returned
        unchanged.

        :param im: H x W x C image array
        :param polys: array of quadrangles, indexed [poly][vertex][x/y]
        :param tags: per-polygon ignore flags
        :param txts: per-polygon transcriptions
        :param crop_background: if True, return the first crop that
            contains no polygon (with empty lists for polys/tags/txts)
        :param max_tries: maximum number of random attempts
        :return: (im, polys, tags, txts) for the chosen crop
        """
        h, w, _ = im.shape
        pad_h = h // 10
        pad_w = w // 10
        h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
        w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
        # mark the rows / columns spanned by each polygon
        for poly in polys:
            poly = np.round(poly, decimals=0).astype(np.int32)
            minx = np.min(poly[:, 0])
            maxx = np.max(poly[:, 0])
            w_array[minx + pad_w:maxx + pad_w] = 1
            miny = np.min(poly[:, 1])
            maxy = np.max(poly[:, 1])
            h_array[miny + pad_h:maxy + pad_h] = 1
        # ensure the cropped area not across a text
        h_axis = np.where(h_array == 0)[0]
        w_axis = np.where(w_array == 0)[0]
        if len(h_axis) == 0 or len(w_axis) == 0:
            return im, polys, tags, txts

        for _ in range(max_tries):
            xx = np.random.choice(w_axis, size=2)
            xmin = np.clip(np.min(xx) - pad_w, 0, w - 1)
            xmax = np.clip(np.max(xx) - pad_w, 0, w - 1)
            yy = np.random.choice(h_axis, size=2)
            ymin = np.clip(np.min(yy) - pad_h, 0, h - 1)
            ymax = np.clip(np.max(yy) - pad_h, 0, h - 1)
            if xmax - xmin < self.min_crop_side_ratio * w or \
                    ymax - ymin < self.min_crop_side_ratio * h:
                # area too small
                continue
            if polys.shape[0] != 0:
                poly_axis_in_area = (polys[:, :, 0] >= xmin) \
                    & (polys[:, :, 0] <= xmax) \
                    & (polys[:, :, 1] >= ymin) \
                    & (polys[:, :, 1] <= ymax)
                # keep polygons with all four vertices inside the crop
                selected_polys = np.where(
                    np.sum(poly_axis_in_area, axis=1) == 4)[0]
            else:
                selected_polys = []

            if len(selected_polys) == 0:
                # no text in this area
                if crop_background:
                    im = im[ymin:ymax + 1, xmin:xmax + 1, :]
                    return im, [], [], []
                continue

            im = im[ymin:ymax + 1, xmin:xmax + 1, :]
            polys = polys[selected_polys]
            tags = tags[selected_polys]
            txts = [txts[idx] for idx in selected_polys]
            # express the vertices in crop coordinates
            polys[:, :, 0] -= xmin
            polys[:, :, 1] -= ymin
            return im, polys, tags, txts
        return im, polys, tags, txts

    def crop_background_infor(self, im, text_polys, text_tags, text_strs):
        """
        Try to build a text-free (background) training sample.

        :return: (im, score_map, geo_map, training_mask) with an all-zero
            score/geo map and an all-one mask, or None if the crop still
            contains polygons
        """
        im, text_polys, text_tags, text_strs = self.crop_area(
            im, text_polys, text_tags, text_strs, crop_background=True)
        if len(text_polys) > 0:
            return None
        # pad and resize image
        input_size = self.input_size
        im, ratio = self.preprocess(im)
        score_map = np.zeros((input_size, input_size), dtype=np.float32)
        geo_map = np.zeros((input_size, input_size, 9), dtype=np.float32)
        training_mask = np.ones((input_size, input_size), dtype=np.float32)
        return im, score_map, geo_map, training_mask

    def crop_foreground_infor(self, im, text_polys, text_tags, text_strs):
        """
        Build a training sample from a random crop that contains text.

        :return: (im, score_map, geo_map, training_mask), or None if the
            crop has no polygon or every polygon is tagged as ignored
        """
        im, text_polys, text_tags, text_strs = self.crop_area(
            im, text_polys, text_tags, text_strs, crop_background=False)
        if text_polys.shape[0] == 0:
            return None
        # continue for all ignore case
        if np.sum((text_tags * 1.0)) >= text_tags.size:
            return None
        # pad and resize image
        im, ratio = self.preprocess(im)
        # bring the polygons to the resized image's scale
        text_polys[:, :, 0] *= ratio
        text_polys[:, :, 1] *= ratio
        _, _, new_h, new_w = im.shape
        score_map, geo_map, training_mask = self.generate_quad(
            (new_h, new_w), text_polys, text_tags)
        return im, score_map, geo_map, training_mask

    def __call__(self, label_infor):
        """
        Turn one annotation line into a training sample.

        :param label_infor: raw annotation line (bytes), see
            convert_label_infor
        :return: (im, score_map, geo_map, training_mask) as float32 arrays
            with the three maps subsampled by 4 in both spatial axes, or
            None if the sample cannot be used
        """
        infor = self.convert_label_infor(label_infor)
        im_path, text_polys, text_tags, text_strs = infor
        im = cv2.imread(im_path)
        if im is None:
            return None
        if text_polys.shape[0] == 0:
            return None
        # add rotate cases
        if np.random.rand() < 0.5:
            im, text_polys = self.rotate_im_poly(im, text_polys)
        h, w, _ = im.shape
        text_polys, text_tags = self.check_and_validate_polys(
            text_polys, text_tags, h, w)
        if text_polys.shape[0] == 0:
            return None

        # random scale this image
        rd_scale = np.random.choice(self.random_scale)
        im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
        text_polys *= rd_scale
        if np.random.rand() < self.background_ratio:
            outs = self.crop_background_infor(im, text_polys, text_tags,
                                              text_strs)
        else:
            outs = self.crop_foreground_infor(im, text_polys, text_tags,
                                              text_strs)

        if outs is None:
            return None
        im, score_map, geo_map, training_mask = outs
        score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32)
        # (h, w, 9) -> (9, h, w)
        geo_map = np.swapaxes(geo_map, 1, 2)
        geo_map = np.swapaxes(geo_map, 1, 0)
        geo_map = geo_map[:, ::4, ::4].astype(np.float32)
        training_mask = training_mask[np.newaxis, ::4, ::4]
        training_mask = training_mask.astype(np.float32)
        return im, score_map, geo_map, training_mask


class EASTProcessTest(object):
    """
    Prepare an image for EAST inference: resize and normalize.
    """

    def __init__(self, params):
        """
        :param params: mapping; 'max_side_len' is optional and defaults
            to 2400
        """
        super(EASTProcessTest, self).__init__()
        if 'max_side_len' in params:
            self.max_side_len = params['max_side_len']
        else:
            self.max_side_len = 2400

    @staticmethod
    def _fit_side_to_32(side):
        """
        Map a side length to the multiple of 32 used by resize_image.

        Exact multiples are kept, sides below 64 become 32, and any other
        side becomes (side // 32 - 1) * 32.
        """
        if side % 32 == 0:
            return side
        if side // 32 <= 1:
            return 32
        return (side // 32 - 1) * 32

    def resize_image(self, im):
        """
        resize image to a size multiple of 32 which is required by the network

        The longer side is first limited to ``self.max_side_len`` (to avoid
        out of memory in gpu), then each side is mapped to a multiple of 32.

        :param im: the image to resize, H x W x C
        :return: the resized image and the resize ratio (ratio_h, ratio_w)
        """
        max_side_len = self.max_side_len
        h, w, _ = im.shape

        # limit the max side
        longest = max(h, w)
        if longest > max_side_len:
            ratio = float(max_side_len) / longest
        else:
            ratio = 1.
        resize_h = self._fit_side_to_32(int(h * ratio))
        resize_w = self._fit_side_to_32(int(w * ratio))
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def __call__(self, im):
        """
        Resize and normalize an image for inference.

        :param im: H x W x 3 image array
            (presumably BGR as returned by cv2.imread - confirm)
        :return: [image of shape (1, 3, H', W'), (ratio_h, ratio_w)]
        """
        im, (ratio_h, ratio_w) = self.resize_image(im)
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        # reverse the channel order before applying per-channel statistics
        im = im[:, :, ::-1].astype(np.float32)
        im = im / 255
        im -= img_mean
        im /= img_std
        # HWC -> CHW, then add a leading batch axis
        im = im.transpose((2, 0, 1))
        im = im[np.newaxis, :]
        return [im, (ratio_h, ratio_w)]