My code works fine in general, but with certain data it raises an error. The problematic data row is: "T turns 10 this month. In honor of the anniversary and the upcoming T@10 Issue, this series looks back at some of the most memorable stories from the magazines first decade."
The reported error is the following traceback:
Traceback (most recent call last):
  File "/Users/mas/Documents/workspace/DeepLearning/BagOfWords.py", line 41, in <module>
    clean_train_reviews.append("".join(KaggleWord2VecUtility.review_to_wordlist(train["Snippet"][i], True)))
  File "/Users/mas/Documents/workspace/DeepLearning/KaggleWord2VecUtility.py", line 22, in review_to_wordlist
    review_text = BeautifulSoup(review).get_text()
  File "/Library/Python/2.7/site-packages/bs4/__init__.py", line 162, in __init__
    elif len(markup) <= 256:
TypeError: object of type 'float' has no len()
The relevant BeautifulSoup constructor code (from bs4/__init__.py, where the error is raised) is:
def deprecated_argument(old_name, new_name): if old_name in kwargs: warnings.warn('The "%s" argument to the BeautifulSoup constructor ''has been renamed to "%s."' % (old_name, new_name)) value = kwargs[old_name] del kwargs[old_name] return value return None parse_only = parse_only or deprecated_argument("parseOnlyThese", "parse_only") from_encoding = from_encoding or deprecated_argument("fromEncoding", "from_encoding") if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError("__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise FeatureNotFound("Couldn't find a tree builder with the features you ""requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256: # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. if (isinstance(markup, unicode) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: possible_filename = markup is_file = False try: is_file = os.path.exists(possible_filename) except Exception, e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: warnings.warn('"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' 
% markup) if markup[:5] == "http:" or markup[:6] == "https:": # TODO: This is ugly but I couldn't get it to work in # Python 3 otherwise. if ((isinstance(markup, bytes) and not b'' in markup) or (isinstance(markup, unicode) and not u'' in markup)): warnings.warn('"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( self.builder.prepare_markup(markup, from_encoding)): self.reset() try: self._feed() break except ParserRejectedMarkup: pass # Clear out the markup and remove the builder's circular # reference to this object. self.markup = None self.builder.soup = None
This is my main code (BagOfWords.py):
import osfrom sklearn.feature_extraction.text import CountVectorizerfrom sklearn.ensemble import RandomForestClassifierfrom KaggleWord2VecUtility import KaggleWord2VecUtilityimport pandas as pdimport numpy as npif __name__ == '__main__': train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'NYTimesBlogTrain.csv'), header=0) test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'NYTimesBlogTest.csv'), header=0) print 'A sample Abstract is:' print train["Abstract"][2838] print 'A sample Snippet is:' print train["Snippet"][2838] #raw_input("Press Enter to continue...") #print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...' #nltk.download() # Download text data sets, including stop words # Initialize an empty list to hold the clean reviews clean_train_reviews = [] # Loop over each review; create an index i that goes from 0 to the length # of the movie review list print len(train["Snippet"]) print "Cleaning and parsing the training set abstracts...\n" for i in xrange( 0, 3000): clean_train_reviews.append("".join(KaggleWord2VecUtility.review_to_wordlist(train["Snippet"][i], True))) if not train["Snippet"][i]: print i #