#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Clean up space-separated English text read from stdin.

Each input line is stripped, has the spaces around hyphens, apostrophes and
common contractions removed (e.g. ``it 's`` -> ``it's``), and is printed.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import json
import logging
import os
import re
import sys
import unicodedata

# Literal (substring, replacement) pairs, applied in this order. A tuple is
# used instead of a dict so the order is explicit on every Python version.
_LITERAL_REPLACEMENTS = (
    (" - ", "-"),
    (" ' ", "'"),
    (" n't", "n't"),
    (" 'm", "'m"),
    (" 's", "'s"),
    (" 've", "'ve"),
    (" 're", "'re"),
)

# " do not" is only contracted when "not" is a whole word; a plain substring
# replace would also rewrite " do nothing" into " don'thing".
_DO_NOT_RE = re.compile(r" do not\b", re.UNICODE)


def clean_string(string):
    """Return ``string`` with tokenization spacing and " do not" contracted.

    Args:
        string: Text (unicode) to clean.

    Returns:
        The text with every pair in ``_LITERAL_REPLACEMENTS`` applied in order
        and each whole-word " do not" replaced by " don't".
    """
    for old, new in _LITERAL_REPLACEMENTS:
        string = string.replace(old, new)
    # The literal replacements only remove spaces next to "-" and "'", so
    # running this substitution after them cannot change which spans match.
    return _DO_NOT_RE.sub(" don't", string)


def decode(string):
    """Clean ``string``, accepting either UTF-8 bytes or text.

    Args:
        string: A UTF-8 encoded byte string, or a text string.

    Returns:
        The cleaned value in the same type as the input: UTF-8 bytes for
        bytes input, text for text input.
    """
    if isinstance(string, bytes):
        return clean_string(string.decode("utf8")).encode("utf8")
    # Text input (e.g. lines from sys.stdin on Python 3) has no .decode().
    return clean_string(string)


def main():
    """Print the cleaned form of every line read from stdin."""
    for line in sys.stdin:
        print(decode(line.strip()))


if __name__ == "__main__":
    main()