Quantcast
Viewing all articles
Browse latest Browse all 3

Error when using Beautiful Soup in Python

My code works fine. However with certain data my code gives error. the problematic data is:T turns 10 this month. In honor of the anniversary and the upcoming T@10 Issue, this series looks back at some of the most memorable stories from the magazines first decade.

the reported problem is

Traceback (most recent call last): File"/Users/mas/Documents/workspace/DeepLearning/BagOfWords.py", line 41, in clean_train_reviews.append("".join(KaggleWord2VecUtility.review_to_wordlist(train["Snippet"][i], True))) File"/Users/mas/Documents/workspace/DeepLearning/KaggleWord2VecUtility.py", line 22, in review_to_wordlist review_text = BeautifulSoup(review).get_text() File "/Library/Python/2.7/site-packages/bs4/init.py", line 162, ininit elif len(markup) <= 256: TypeError: object of type 'float' has no len()

the code :

def deprecated_argument(old_name, new_name):        if old_name in kwargs:            warnings.warn('The "%s" argument to the BeautifulSoup constructor ''has been renamed to "%s."' % (old_name, new_name))            value = kwargs[old_name]            del kwargs[old_name]            return value        return None    parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only")    from_encoding = from_encoding or deprecated_argument("fromEncoding", "from_encoding")    if len(kwargs) > 0:        arg = kwargs.keys().pop()        raise TypeError("__init__() got an unexpected keyword argument '%s'" % arg)    if builder is None:        if isinstance(features, basestring):            features = [features]        if features is None or len(features) == 0:            features = self.DEFAULT_BUILDER_FEATURES        builder_class = builder_registry.lookup(*features)        if builder_class is None:            raise FeatureNotFound("Couldn't find a tree builder with the features you ""requested: %s. Do you need to install a parser library?"                % ",".join(features))        builder = builder_class()    self.builder = builder    self.is_xml = builder.is_xml    self.builder.soup = self    self.parse_only = parse_only    if hasattr(markup, 'read'):        # It's a file-type object.        markup = markup.read()    elif len(markup) <= 256:        # Print out warnings for a couple beginner problems        # involving passing non-markup to Beautiful Soup.        # Beautiful Soup will still parse the input as markup,        # just in case that's what the user really wants.        if (isinstance(markup, unicode)            and not os.path.supports_unicode_filenames):            possible_filename = markup.encode("utf8")        else:            possible_filename = markup        is_file = False        try:            is_file = os.path.exists(possible_filename)        except Exception, e:            # This is almost certainly a problem involving            # characters not valid in filenames on this            # system. Just let it go.            pass        if is_file:            warnings.warn('"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)        if markup[:5] == "http:" or markup[:6] == "https:":            # TODO: This is ugly but I couldn't get it to work in            # Python 3 otherwise.            if ((isinstance(markup, bytes) and not b'' in markup)                or (isinstance(markup, unicode) and not u'' in markup)):                warnings.warn('"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)    for (self.markup, self.original_encoding, self.declared_html_encoding,     self.contains_replacement_characters) in (        self.builder.prepare_markup(markup, from_encoding)):        self.reset()        try:            self._feed()            break        except ParserRejectedMarkup:            pass    # Clear out the markup and remove the builder's circular    # reference to this object.    self.markup = None    self.builder.soup = None

this is my main code:

import osfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.ensemble import RandomForestClassifierfrom KaggleWord2VecUtility import KaggleWord2VecUtilityimport pandas as pdimport numpy as npif __name__ == '__main__':    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'NYTimesBlogTrain.csv'), header=0)    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'NYTimesBlogTest.csv'), header=0)    print 'A sample Abstract is:'    print train["Abstract"][2838]    print 'A sample Snippet is:'    print train["Snippet"][2838]    #raw_input("Press Enter to continue...")    #print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'    #nltk.download()  # Download text data sets, including stop words    # Initialize an empty list to hold the clean reviews    clean_train_reviews = []    # Loop over each review; create an index i that goes from 0 to the length    # of the movie review list    print len(train["Snippet"])    print "Cleaning and parsing the training set abstracts...\n"    for i in xrange( 0, 3000):        clean_train_reviews.append("".join(KaggleWord2VecUtility.review_to_wordlist(train["Snippet"][i], True)))        if not train["Snippet"][i]:            print i  #  

Viewing all articles
Browse latest Browse all 3

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>