Posts

Showing posts from March, 2017

Create a custom Transformer in PySpark ML

Via: http://stackoverflow.com/questions/32331848/create-a-custom-transformer-in-pyspark-ml

import nltk
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


class NLTKWordPunctTokenizer(Transformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, stopwords=None):
        super(NLTKWordPunctTokenizer, self).__init__()
        self.stopwords = Param(self, "stopwords", "")
        self._setDefault(stopwords=set())
        kwargs = self.__init__._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, stopwords=None):