2016-12-21 10 views
0

Приносим извинения за этот большой блок кода. Это самый компактный способ предоставить воспроизводимый рабочий пример.

Sklearn - FeatureUnion - Transformer: TypeError: fit_transform() принимает 2 позиционных аргумента, но передано 3

В коде я пытаюсь использовать FeatureUnion для преобразования двух столбцов датафрейма: один столбец содержит текстовые данные, поэтому для него используется TfidfVectorizer, а другой содержит списки тегов, поэтому для него я хочу использовать MultiLabelBinarizer.

Трансформер ItemSelector предназначен для выбора нужного столбца из датафрейма.

Почему я получаю TypeError: fit_transform() takes 2 positional arguments but 3 were given?

Что мне нужно изменить в коде, чтобы этот пример работал правильно?

from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.base import TransformerMixin, BaseEstimator 
from sklearn.pipeline import Pipeline, FeatureUnion 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.model_selection import GridSearchCV 
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.linear_model import SGDClassifier 

import pandas as pd 
import numpy as np 

# Toy dataset of 25 rows: every column is the same five-row pattern repeated
# five times. Note that the 'multilabels' values are strings that merely look
# like Python lists; they are not list objects.
d = {
    'label': [
        'Help',
        'Help',
        'Other',
        'Sale/Coupon',
        'Other',
    ] * 5,
    'multilabels': [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ] * 5,
    'response': [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ] * 5,
}

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one item from its input by key.

    Intended as the first step of a pipeline so that later steps see only
    the selected column.

    Parameters
    ----------
    key : hashable
        Key used to index the object passed to ``transform``
        (for example a DataFrame column name).
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, df):
        """Return ``df[self.key]``."""
        return df[self.key]

# Build the frame first, then split it into the two feature columns and the
# target column.
df = pd.DataFrame(data=d, columns=['response', 'multilabels', 'label'])
X = df[['response', 'multilabels']]
y = df['label']

# Two parallel branches whose outputs are concatenated side by side:
#   step1 - TF-IDF features from the 'response' text column
#   step2 - binarized tags from the 'multilabels' column
# NOTE: MultiLabelBinarizer.fit_transform accepts a single argument, while
# Pipeline calls fit_transform(Xt, y) on its last step. Using the binarizer
# as the final step of 'step2' is what raises the TypeError this example
# reproduces.
feature_union = FeatureUnion(transformer_list=[
    ('step1', Pipeline(steps=[
        ('selector', ItemSelector(key='response')),
        ('tfidf', TfidfVectorizer()),
    ])),
    ('step2', Pipeline(steps=[
        ('selector', ItemSelector(key='multilabels')),
        ('multilabel', MultiLabelBinarizer()),
    ])),
])

# One-vs-rest wrapper around "features -> SGD classifier".
pipeline = OneVsRestClassifier(
    estimator=Pipeline(steps=[
        ('union', feature_union),
        ('sgd', SGDClassifier()),
    ])
)

# Empty parameter grid: a single candidate is cross-validated.
grid = GridSearchCV(estimator=pipeline, param_grid={}, verbose=5)
grid.fit(X, y)

Вот полный текст ошибки:

Traceback (most recent call last): 
    File "C:/Users/owner/Documents/my files/Account Tracking/Client/Foresee Analysis/SOQuestion.py", line 72, in <module> 
    grid.fit(X, y) 
    File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 945, in fit 
    return self._fit(X, y, groups, ParameterGrid(self.param_grid)) 
    File "C:\Python34\lib\site-packages\sklearn\model_selection\_search.py", line 564, in _fit 
    for parameters in parameter_iterable 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch 
    job = self._backend.apply_async(batch, callback=cb) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async 
    result = ImmediateResult(func) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__ 
    self.results = batch() 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp> 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\model_selection\_validation.py", line 238, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 
    File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 216, in fit 
    for i, column in enumerate(columns)) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch 
    job = self._backend.apply_async(batch, callback=cb) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async 
    result = ImmediateResult(func) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__ 
    self.results = batch() 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp> 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 80, in _fit_binary 
    estimator.fit(X, y) 
    File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 268, in fit 
    Xt, fit_params = self._fit(X, y, **fit_params) 
    File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 234, in _fit 
    Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
    File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 734, in fit_transform 
    for name, trans, weight in self._iter()) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 758, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 608, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 571, in _dispatch 
    job = self._backend.apply_async(batch, callback=cb) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 109, in apply_async 
    result = ImmediateResult(func) 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 326, in __init__ 
    self.results = batch() 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp> 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 577, in _fit_transform_one 
    res = transformer.fit_transform(X, y, **fit_params) 
    File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 303, in fit_transform 
    return last_step.fit_transform(Xt, y, **fit_params) 
TypeError: fit_transform() takes 2 positional arguments but 3 were given 

Примечание: я посмотрел вопрос «_transform() takes 2 positional arguments but 3 were given», но мне по-прежнему непонятно, в чём причина ошибки.

ответ

0

Разобрался. Сделал отдельный трансформер для бинаризации нескольких меток. Это скорее обходной путь, чем решение, поскольку бинаризация происходит внутри метода transform, а не отдельным шагом конвейера.

import ast

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer

# Toy dataset of 25 rows: every column is the same five-row pattern repeated
# five times. The 'multilabels' values are string representations of lists,
# not list objects.
d = {
    'label': [
        'Help',
        'Help',
        'Other',
        'Sale/Coupon',
        'Other',
    ] * 5,
    'multilabels': [
        "['Samples']",
        "['Deck']",
        "['Deck', 'Deck Over', 'Stain']",
        "['Coupons']",
        "['Bathroom']",
    ] * 5,
    'response': [
        'this is some text',
        'this is some more text',
        'and here is some more',
        'and some more',
        'and here we go some more yay done',
    ] * 5,
}

class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts one item from its input by key.

    Parameters
    ----------
    column : hashable
        Key used to index the object passed to ``transform``
        (for example a DataFrame column name).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; all arguments are ignored."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X[self.column]``; ``y`` and ``fit_params`` are unused."""
        return X[self.column]

class MultiLabelTransformer(BaseEstimator, TransformerMixin):
    """Binarize one column of label collections into indicator features.

    MultiLabelBinarizer cannot be placed in a Pipeline directly because its
    ``fit_transform`` takes a single argument, while Pipeline calls
    ``fit_transform(X, y)``. This wrapper exposes the usual
    ``fit(X, y)`` / ``transform(X)`` interface instead.

    The binarizer is fitted once in ``fit`` and reused in ``transform`` so
    that the output columns are the same for training and for unseen data.
    Refitting inside ``transform`` would make the columns depend on whatever
    labels happen to be present in each call.

    Parameters
    ----------
    column : hashable
        Key of the column in ``X`` that holds an iterable of labels per row.

    Attributes
    ----------
    mlb_ : MultiLabelBinarizer
        The binarizer fitted on the training data; set by ``fit``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Learn the label set from ``X[self.column]`` and return ``self``."""
        self.mlb_ = MultiLabelBinarizer().fit(X[self.column])
        return self

    def transform(self, X):
        """Return the indicator matrix for ``X[self.column]``.

        Requires a prior call to ``fit``. How labels unseen during ``fit``
        are treated is decided by ``MultiLabelBinarizer.transform``
        (NOTE(review): recent scikit-learn releases ignore them with a
        warning - confirm for the installed version).
        """
        return self.mlb_.transform(X[self.column])

# One-vs-rest SGD classifier over two concatenated feature branches:
#   step1 - TF-IDF features from the 'response' text column
#   step2 - binarized tags from the 'multilabels' column
pipeline = OneVsRestClassifier(
    Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('step1', Pipeline([
                    ('selector', ItemSelector(column='response')),
                    ('tfidf', TfidfVectorizer()),
                ])),
                ('step2', Pipeline([
                    ('selector', MultiLabelTransformer(column='multilabels')),
                ])),
            ])),
        ('sgd', SGDClassifier()),
    ])
)

# Empty parameter grid: a single candidate is cross-validated.
grid = GridSearchCV(pipeline, {}, verbose=5)

df = pd.DataFrame(d, columns=['response', 'multilabels', 'label'])
# The tags are stored as string representations of lists. Parse them with
# ast.literal_eval, which accepts only Python literals, rather than eval(),
# which would execute arbitrary expressions found in the data.
df['multilabels'] = df['multilabels'].apply(ast.literal_eval)
X = df[['response', 'multilabels']]
y = df['label']
grid.fit(X, y)

 Смежные вопросы

  • Нет связанных вопросов^_^