def __init__(self, font_path=None, width=400, height=200,
             margin=2,
             ranks_only=None, prefer_horizontal=.9, mask=None, scale=1,
             color_func=None, max_words=200, min_font_size=4,
             stopwords=None, random_state=None,
             background_color='black',
             max_font_size=None, font_step=1, mode="RGB",
             relative_scaling=.5, regexp=None, collocations=True,
             colormap=None, normalize_plurals=True):
    """Store the word cloud configuration on the instance.

    Parameters
    ----------
    font_path : string or None, default=None
        Path of the font to draw with. ``FONT_PATH`` is used when None.
    width, height : int
        Canvas size; ``generate_from_frequencies`` uses it when no mask
        is set.
    margin : int
        Extra space reserved around every word when placing it.
    ranks_only : deprecated
        Has no effect. Passing anything but None emits a
        DeprecationWarning.
    prefer_horizontal : float
        A word is laid out horizontally when a random draw falls below
        this value, otherwise it is rotated by 90 degrees.
    mask : array or None
        Optional mask; stored as given.
    scale : number
        Factor applied to the canvas size, font sizes and positions in
        ``to_image``.
    color_func : callable or None
        Coloring function. When None, one is built from ``colormap``
        with ``colormap_color_func``.
    max_words : int
        Upper bound on the number of words laid out.
    min_font_size : int
        Layout stops once the font size drops below this value.
    stopwords : collection of strings or None
        Words dropped by ``process_text``. ``STOPWORDS`` is used when
        None.
    random_state : Random, int or None
        An int is used as the seed of a new ``Random``; anything else is
        stored unchanged.
    background_color, mode
        Passed to ``Image.new`` by ``to_image``.
    max_font_size : int or None
        Font size of the first word; derived automatically when None.
    font_step : int
        Amount the font size is reduced by when a word does not fit.
    relative_scaling : float
        Must lie between 0 and 1 (inclusive).
    regexp : string or None
        Tokenization pattern for ``process_text``.
    collocations : bool
        Whether ``process_text`` also counts bigrams.
    colormap : string, matplotlib colormap or None
        Used to build the coloring function when ``color_func`` is None.
    normalize_plurals : bool
        Forwarded to the token processing helpers.

    Raises
    ------
    ValueError
        If ``relative_scaling`` is outside the range 0 to 1.
    """
    if font_path is None:
        font_path = FONT_PATH
    if color_func is None and colormap is None:
        # Nothing was specified, so a default colormap is needed.
        import matplotlib
        # Compare (major, minor) numerically. Comparing single characters
        # of the version string misclassifies releases such as "0.99" or
        # a two-digit major version.
        release = []
        for piece in matplotlib.__version__.split(".")[:2]:
            digits = ""
            for char in piece:
                if not char.isdigit():
                    break
                digits += char
            release.append(int(digits or 0))
        # Releases older than 1.5 get "hsv"; everything else "viridis".
        colormap = "hsv" if tuple(release) < (1, 5) else "viridis"
    self.colormap = colormap
    self.collocations = collocations
    self.font_path = font_path
    self.width = width
    self.height = height
    self.margin = margin
    self.prefer_horizontal = prefer_horizontal
    self.mask = mask
    self.scale = scale
    self.color_func = color_func or colormap_color_func(colormap)
    self.max_words = max_words
    self.stopwords = stopwords if stopwords is not None else STOPWORDS
    self.min_font_size = min_font_size
    self.font_step = font_step
    self.regexp = regexp
    if isinstance(random_state, int):
        # An integer is interpreted as a seed.
        random_state = Random(random_state)
    self.random_state = random_state
    self.background_color = background_color
    self.max_font_size = max_font_size
    self.mode = mode
    if relative_scaling < 0 or relative_scaling > 1:
        raise ValueError("relative_scaling needs to be "
                         "between 0 and 1, got %f." % relative_scaling)
    self.relative_scaling = relative_scaling
    if ranks_only is not None:
        # stacklevel=2 attributes the warning to the calling code.
        warnings.warn("ranks_only is deprecated and will be removed as"
                      " it had no effect. Look into relative_scaling.",
                      DeprecationWarning, stacklevel=2)
    self.normalize_plurals = normalize_plurals
def fit_words(self, frequencies):
    """Build the word cloud from a word-to-frequency mapping.

    Thin alias that forwards to ``generate_from_frequencies``.

    Parameters
    ----------
    frequencies : dict from string to float
        Maps each word to its associated frequency.

    Returns
    -------
    self
    """
    cloud = self.generate_from_frequencies(frequencies)
    return cloud
def generate_from_frequencies(self, frequencies, max_font_size=None):
    """Create a word_cloud from words and frequencies.

    Words are placed from most to least frequent on a grey-scale canvas.
    For every word a free position is sampled; when none exists the
    other orientation is tried once, then the font is shrunk step by
    step. Layout stops as soon as the font size falls below
    ``self.min_font_size``.

    Sets ``self.words_`` (word -> frequency normalized by the largest
    frequency) and ``self.layout_``, a list of
    ``((word, normalized_frequency), font_size, position, orientation,
    color)`` tuples for the words that were placed. ``position`` is
    stored as ``(x, y)`` and handed to the drawing call as ``(y, x)``.

    Parameters
    ----------
    frequencies : dict from string to float
        Maps each word to its associated frequency.
    max_font_size : int
        Use this font-size instead of self.max_font_size

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``frequencies`` is empty, if the mask is neither 2- nor
        3-dimensional, or if the canvas is too small to place a word
        while estimating the font size.
    """
    # make sure frequencies are sorted and normalized
    frequencies = sorted(frequencies.items(), key=itemgetter(1),
                         reverse=True)
    if not frequencies:
        raise ValueError("We need at least 1 word to plot a word cloud, "
                         "got %d." % len(frequencies))
    frequencies = frequencies[:self.max_words]

    # largest entry will be 1
    max_frequency = float(frequencies[0][1])
    frequencies = [(word, freq / max_frequency)
                   for word, freq in frequencies]

    if self.random_state is not None:
        random_state = self.random_state
    else:
        random_state = Random()

    if self.mask is not None:
        mask = self.mask
        # the mask dictates the canvas size
        width = mask.shape[1]
        height = mask.shape[0]
        if mask.dtype.kind == 'f':
            warnings.warn("mask image should be unsigned byte between 0"
                          " and 255. Got a float array")
        if mask.ndim == 2:
            boolean_mask = mask == 255
        elif mask.ndim == 3:
            # if all channels are white, mask out
            boolean_mask = np.all(mask[:, :, :3] == 255, axis=-1)
        else:
            raise ValueError("Got mask of invalid shape: %s"
                             % str(mask.shape))
    else:
        boolean_mask = None
        height, width = self.height, self.width
    occupancy = IntegralOccupancyMap(height, width, boolean_mask)

    # create image
    img_grey = Image.new("L", (width, height))
    draw = ImageDraw.Draw(img_grey)
    font_sizes, positions, orientations, colors = [], [], [], []

    last_freq = 1.

    if max_font_size is None:
        # if not provided use default font_size
        max_font_size = self.max_font_size

    if max_font_size is None:
        # figure out a good font size by trying to draw with
        # just the first two words
        if len(frequencies) == 1:
            # we only have one word. We make it big!
            font_size = self.height
        else:
            self.generate_from_frequencies(dict(frequencies[:2]),
                                           max_font_size=self.height)
            # find font sizes
            sizes = [x[1] for x in self.layout_]
            try:
                # harmonic mean of the two trial font sizes
                font_size = int(2 * sizes[0] * sizes[1]
                                / (sizes[0] + sizes[1]))
            # quick fix for if self.layout_ contains less than 2 values
            # on very small images it can be empty
            except IndexError:
                try:
                    font_size = sizes[0]
                except IndexError:
                    raise ValueError('canvas size is too small')
    else:
        font_size = max_font_size

    # we set self.words_ here because we called generate_from_frequencies
    # above... hurray for good design?
    self.words_ = dict(frequencies)

    # start drawing grey image
    for word, freq in frequencies:
        # select the font size
        rs = self.relative_scaling
        if rs != 0:
            font_size = int(round((rs * (freq / float(last_freq))
                                   + (1 - rs)) * font_size))
        if random_state.random() < self.prefer_horizontal:
            orientation = None
        else:
            orientation = Image.ROTATE_90
        tried_other_orientation = False
        while True:
            # try to find a position
            font = ImageFont.truetype(self.font_path, font_size)
            # transpose font optionally
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # get size of resulting text
            box_size = draw.textsize(word, font=transposed_font)
            # find possible places using integral image:
            result = occupancy.sample_position(box_size[1] + self.margin,
                                               box_size[0] + self.margin,
                                               random_state)
            if result is not None or font_size < self.min_font_size:
                # either we found a place or font-size went too small
                break
            # if we didn't find a place, make font smaller
            # but first try to rotate!
            if not tried_other_orientation and self.prefer_horizontal < 1:
                # Flip to the orientation that has not been tried yet;
                # a rotated word falls back to horizontal.
                orientation = (Image.ROTATE_90 if orientation is None
                               else None)
                tried_other_orientation = True
            else:
                font_size -= self.font_step
                orientation = None

        if font_size < self.min_font_size:
            # we were unable to draw any more
            break

        x, y = np.array(result) + self.margin // 2
        # actually draw the text
        draw.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)
        colors.append(self.color_func(word, font_size=font_size,
                                      position=(x, y),
                                      orientation=orientation,
                                      random_state=random_state,
                                      font_path=self.font_path))
        # recompute integral image
        if self.mask is None:
            img_array = np.asarray(img_grey)
        else:
            img_array = np.asarray(img_grey) + boolean_mask
        # recompute bottom right
        occupancy.update(img_array, x, y)
        last_freq = freq

    # zip stops at the shortest list, so only placed words are kept
    self.layout_ = list(zip(frequencies, font_sizes, positions,
                            orientations, colors))
    return self
def process_text(self, text):
    """Tokenize ``text`` and count the words that survive filtering.

    Tokens are extracted with ``self.regexp`` (or a default word
    pattern). Stopwords are discarded case-insensitively, a trailing
    ``'s`` is stripped, and purely numeric tokens are dropped before
    counting.

    Parameters
    ----------
    text : string
        The text to be processed.

    Returns
    -------
    words : dict (string, int)
        Word tokens with associated frequency.

    .. versionchanged:: 1.2.2
        Changed return type from list of tuples to dict.

    Notes
    -----
    There are better ways to do word tokenization, but I don't want to
    include all those things.
    """
    excluded = {entry.lower() for entry in self.stopwords}

    # re.UNICODE is only requested for unicode input on Python 2
    if sys.version < '3' and type(text) is unicode:
        flags = re.UNICODE
    else:
        flags = 0
    pattern = r"\w[\w']+" if self.regexp is None else self.regexp

    kept = []
    for token in re.findall(pattern, text, flags):
        # remove stopwords
        if token.lower() in excluded:
            continue
        # remove 's
        if token.lower().endswith("'s"):
            token = token[:-2]
        # remove numbers
        if token.isdigit():
            continue
        kept.append(token)

    if not self.collocations:
        word_counts, _ = process_tokens(kept, self.normalize_plurals)
        return word_counts
    return unigrams_and_bigrams(kept, self.normalize_plurals)
def generate_from_text(self, text):
    """Generate wordcloud from text.

    The input "text" is expected to be a natural text. If you pass a
    sorted list of words, words will appear in your output twice. To
    remove this duplication, set ``collocations=False``.

    Runs ``process_text`` on the input and feeds its result to
    ``generate_from_frequencies``.

    .. versionchanged:: 1.2.2
        Argument of generate_from_frequencies() is not return of
        process_text() any more.

    Returns
    -------
    self
    """
    self.generate_from_frequencies(self.process_text(text))
    return self
def generate(self, text):
    """Generate wordcloud from text.

    The input "text" is expected to be a natural text. If you pass a
    sorted list of words, words will appear in your output twice. To
    remove this duplication, set ``collocations=False``.

    Alias to generate_from_text, to which the call is forwarded
    unchanged.

    Returns
    -------
    self
    """
    cloud = self.generate_from_text(text)
    return cloud
def _check_generated(self):
    """Raise ``ValueError`` unless a ``layout_`` attribute is present."""
    if hasattr(self, "layout_"):
        return
    raise ValueError("WordCloud has not been calculated, call generate"
                     " first.")
def to_image(self):
    """Render the computed layout into a new image.

    The canvas takes its size from the mask when one is set, otherwise
    from ``self.width`` / ``self.height``; the size, every font size and
    every position are multiplied by ``self.scale``.

    Returns
    -------
    img : image created with ``Image.new``

    Raises
    ------
    ValueError
        Propagated from ``_check_generated`` when no layout exists yet.
    """
    self._check_generated()

    if self.mask is None:
        height, width = self.height, self.width
    else:
        width = self.mask.shape[1]
        height = self.mask.shape[0]

    canvas_size = (int(width * self.scale), int(height * self.scale))
    img = Image.new(self.mode, canvas_size, self.background_color)
    draw = ImageDraw.Draw(img)

    for (word, count), font_size, position, orientation, color in self.layout_:
        scaled_font = ImageFont.truetype(self.font_path,
                                         int(font_size * self.scale))
        transposed_font = ImageFont.TransposedFont(
            scaled_font, orientation=orientation)
        # positions are stored as (x, y) but drawn as (y, x)
        pos = (int(position[1] * self.scale),
               int(position[0] * self.scale))
        draw.text(pos, word, fill=color, font=transposed_font)
    return img
def recolor(self, random_state=None, color_func=None, colormap=None):
    """Recolor existing layout.

    Applying a new coloring is much faster than generating the whole
    wordcloud.

    Parameters
    ----------
    random_state : RandomState, int, or None, default=None
        If not None, a fixed random state is used. If an int is given,
        this is used as seed for a random.Random state.

    color_func : function or None, default=None
        Function to generate new color from word count, font size,
        position and orientation. If None, self.color_func is used.

    colormap : string or matplotlib colormap, default=None
        Use this colormap to generate new colors. Ignored if color_func
        is specified. If None, self.color_func is used.

    Returns
    -------
    self
    """
    if isinstance(random_state, int):
        random_state = Random(random_state)
    self._check_generated()

    if color_func is None:
        color_func = (self.color_func if colormap is None
                      else colormap_color_func(colormap))

    recolored = []
    for word_freq, font_size, position, orientation, _ in self.layout_:
        new_color = color_func(word=word_freq[0], font_size=font_size,
                               position=position,
                               orientation=orientation,
                               random_state=random_state,
                               font_path=self.font_path)
        recolored.append((word_freq, font_size, position, orientation,
                          new_color))
    # replace the layout only after every word has been recolored
    self.layout_ = recolored
    return self
def to_file(self, filename):
    """Export to image file.

    Renders the cloud with ``to_image`` and saves the result with
    ``optimize=True``.

    Parameters
    ----------
    filename : string
        Location to write to.

    Returns
    -------
    self
    """
    self.to_image().save(filename, optimize=True)
    return self
def to_array(self):
    """Convert to numpy array.

    Returns
    -------
    image : nd-array
        The image produced by ``to_image`` converted with ``np.array``.
    """
    rendered = self.to_image()
    return np.array(rendered)
def __array__(self):
    """Convert to numpy array.

    Delegates to ``to_array``.

    Returns
    -------
    image : nd-array
        Whatever ``to_array`` returns for this word cloud.
    """
    as_array = self.to_array()
    return as_array
def to_html(self):
    """HTML export placeholder; always raises ``NotImplementedError``."""
    not_ready = NotImplementedError("FIXME!!!")
    raise not_ready